* [PATCH v9 01/37] backends/iommufd: Introduce iommufd_backend_alloc_viommu
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
@ 2026-01-26 10:42 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:42 ` [PATCH v9 02/37] backends/iommufd: Introduce iommufd_backend_alloc_vdev Shameer Kolothum
` (36 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:42 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
From: Nicolin Chen <nicolinc@nvidia.com>
Add a helper to allocate a viommu object.
Also introduce a struct IOMMUFDViommu that can be used later by vendor
IOMMU implementations.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
backends/iommufd.c | 26 ++++++++++++++++++++++++++
backends/trace-events | 1 +
include/system/iommufd.h | 14 ++++++++++++++
3 files changed, 41 insertions(+)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 086bd67aea..c65dc41d71 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -447,6 +447,32 @@ bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id,
return !ret;
}
+bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
+ uint32_t viommu_type, uint32_t hwpt_id,
+ uint32_t *out_viommu_id, Error **errp)
+{
+ int ret;
+ struct iommu_viommu_alloc alloc_viommu = {
+ .size = sizeof(alloc_viommu),
+ .type = viommu_type,
+ .dev_id = dev_id,
+ .hwpt_id = hwpt_id,
+ };
+
+ ret = ioctl(be->fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu);
+
+ trace_iommufd_backend_alloc_viommu(be->fd, dev_id, viommu_type, hwpt_id,
+ alloc_viommu.out_viommu_id, ret);
+ if (ret) {
+ error_setg_errno(errp, errno, "IOMMU_VIOMMU_ALLOC failed");
+ return false;
+ }
+
+ g_assert(out_viommu_id);
+ *out_viommu_id = alloc_viommu.out_viommu_id;
+ return true;
+}
+
bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
uint32_t hwpt_id, Error **errp)
{
diff --git a/backends/trace-events b/backends/trace-events
index e1992ba12f..1ae94c4290 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -21,3 +21,4 @@ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%
iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)"
iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t flags, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" flags=0x%"PRIx64" page_size=0x%"PRIx64" (%d)"
iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_viommu(int iommufd, uint32_t dev_id, uint32_t type, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)"
diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index 1392706b83..bc4acd1e8b 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -38,6 +38,16 @@ struct IOMMUFDBackend {
/*< public >*/
};
+/*
+ * Virtual IOMMU object that represents physical IOMMU's virtualization
+ * support
+ */
+typedef struct IOMMUFDViommu {
+ IOMMUFDBackend *iommufd;
+ uint32_t s2_hwpt_id; /* ID of stage 2 HWPT */
+ uint32_t viommu_id; /* virtual IOMMU ID of allocated object */
+} IOMMUFDViommu;
+
bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
void iommufd_backend_disconnect(IOMMUFDBackend *be);
@@ -59,6 +69,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
uint32_t data_type, uint32_t data_len,
void *data_ptr, uint32_t *out_hwpt,
Error **errp);
+bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
+ uint32_t viommu_type, uint32_t hwpt_id,
+ uint32_t *out_hwpt, Error **errp);
+
bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id,
bool start, Error **errp);
bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 01/37] backends/iommufd: Introduce iommufd_backend_alloc_viommu
2026-01-26 10:42 ` [PATCH v9 01/37] backends/iommufd: Introduce iommufd_backend_alloc_viommu Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:42, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Add a helper to allocate a viommu object.
>
> Also introduce a struct IOMMUFDViommu that can be used later by vendor
> IOMMU implementations.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> backends/iommufd.c | 26 ++++++++++++++++++++++++++
> backends/trace-events | 1 +
> include/system/iommufd.h | 14 ++++++++++++++
> 3 files changed, 41 insertions(+)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 086bd67aea..c65dc41d71 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -447,6 +447,32 @@ bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id,
> return !ret;
> }
>
> +bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
> + uint32_t viommu_type, uint32_t hwpt_id,
> + uint32_t *out_viommu_id, Error **errp)
> +{
> + int ret;
> + struct iommu_viommu_alloc alloc_viommu = {
> + .size = sizeof(alloc_viommu),
> + .type = viommu_type,
> + .dev_id = dev_id,
> + .hwpt_id = hwpt_id,
> + };
> +
> + ret = ioctl(be->fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu);
> +
> + trace_iommufd_backend_alloc_viommu(be->fd, dev_id, viommu_type, hwpt_id,
> + alloc_viommu.out_viommu_id, ret);
> + if (ret) {
> + error_setg_errno(errp, errno, "IOMMU_VIOMMU_ALLOC failed");
> + return false;
> + }
> +
> + g_assert(out_viommu_id);
> + *out_viommu_id = alloc_viommu.out_viommu_id;
> + return true;
> +}
> +
> bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> uint32_t hwpt_id, Error **errp)
> {
> diff --git a/backends/trace-events b/backends/trace-events
> index e1992ba12f..1ae94c4290 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -21,3 +21,4 @@ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%
> iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)"
> iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t flags, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" flags=0x%"PRIx64" page_size=0x%"PRIx64" (%d)"
> iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)"
> +iommufd_backend_alloc_viommu(int iommufd, uint32_t dev_id, uint32_t type, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)"
> diff --git a/include/system/iommufd.h b/include/system/iommufd.h
> index 1392706b83..bc4acd1e8b 100644
> --- a/include/system/iommufd.h
> +++ b/include/system/iommufd.h
> @@ -38,6 +38,16 @@ struct IOMMUFDBackend {
> /*< public >*/
> };
>
> +/*
> + * Virtual IOMMU object that represents physical IOMMU's virtualization
> + * support
> + */
> +typedef struct IOMMUFDViommu {
> + IOMMUFDBackend *iommufd;
> + uint32_t s2_hwpt_id; /* ID of stage 2 HWPT */
> + uint32_t viommu_id; /* virtual IOMMU ID of allocated object */
> +} IOMMUFDViommu;
> +
> bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
> void iommufd_backend_disconnect(IOMMUFDBackend *be);
>
> @@ -59,6 +69,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
> uint32_t data_type, uint32_t data_len,
> void *data_ptr, uint32_t *out_hwpt,
> Error **errp);
> +bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
> + uint32_t viommu_type, uint32_t hwpt_id,
> + uint32_t *out_hwpt, Error **errp);
> +
> bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id,
> bool start, Error **errp);
> bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 02/37] backends/iommufd: Introduce iommufd_backend_alloc_vdev
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
2026-01-26 10:42 ` [PATCH v9 01/37] backends/iommufd: Introduce iommufd_backend_alloc_viommu Shameer Kolothum
@ 2026-01-26 10:42 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 03/37] hw/arm/smmu-common: Factor out common helper functions and export Shameer Kolothum
` (35 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:42 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
From: Nicolin Chen <nicolinc@nvidia.com>
Add a helper to allocate an iommufd device's virtual device (in the user
space) per a viommu instance.
While at it, introduce a struct IOMMUFDVdev for later use by vendor
IOMMU implementations.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
backends/iommufd.c | 27 +++++++++++++++++++++++++++
backends/trace-events | 1 +
include/system/iommufd.h | 12 ++++++++++++
3 files changed, 40 insertions(+)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index c65dc41d71..e3a3c1480e 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -473,6 +473,33 @@ bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
return true;
}
+bool iommufd_backend_alloc_vdev(IOMMUFDBackend *be, uint32_t dev_id,
+ uint32_t viommu_id, uint64_t virt_id,
+ uint32_t *out_vdev_id, Error **errp)
+{
+ int ret;
+ struct iommu_vdevice_alloc alloc_vdev = {
+ .size = sizeof(alloc_vdev),
+ .viommu_id = viommu_id,
+ .dev_id = dev_id,
+ .virt_id = virt_id,
+ };
+
+ ret = ioctl(be->fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev);
+
+ trace_iommufd_backend_alloc_vdev(be->fd, dev_id, viommu_id, virt_id,
+ alloc_vdev.out_vdevice_id, ret);
+
+ if (ret) {
+ error_setg_errno(errp, errno, "IOMMU_VDEVICE_ALLOC failed");
+ return false;
+ }
+
+ g_assert(out_vdev_id);
+ *out_vdev_id = alloc_vdev.out_vdevice_id;
+ return true;
+}
+
bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
uint32_t hwpt_id, Error **errp)
{
diff --git a/backends/trace-events b/backends/trace-events
index 1ae94c4290..14a7ecf5aa 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -22,3 +22,4 @@ iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) "
iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t flags, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" flags=0x%"PRIx64" page_size=0x%"PRIx64" (%d)"
iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)"
iommufd_backend_alloc_viommu(int iommufd, uint32_t dev_id, uint32_t type, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)"
+iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)"
diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index bc4acd1e8b..567dfb7b1d 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -48,6 +48,14 @@ typedef struct IOMMUFDViommu {
uint32_t viommu_id; /* virtual IOMMU ID of allocated object */
} IOMMUFDViommu;
+/*
+ * Virtual device object for a physical device bind to a vIOMMU.
+ */
+typedef struct IOMMUFDVdev {
+ uint32_t vdevice_id; /* object handle for vDevice */
+ uint32_t virt_id; /* virtual device ID */
+} IOMMUFDVdev;
+
bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
void iommufd_backend_disconnect(IOMMUFDBackend *be);
@@ -73,6 +81,10 @@ bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
uint32_t viommu_type, uint32_t hwpt_id,
uint32_t *out_hwpt, Error **errp);
+bool iommufd_backend_alloc_vdev(IOMMUFDBackend *be, uint32_t dev_id,
+ uint32_t viommu_id, uint64_t virt_id,
+ uint32_t *out_vdev_id, Error **errp);
+
bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id,
bool start, Error **errp);
bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 02/37] backends/iommufd: Introduce iommufd_backend_alloc_vdev
2026-01-26 10:42 ` [PATCH v9 02/37] backends/iommufd: Introduce iommufd_backend_alloc_vdev Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:42, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Add a helper to allocate an iommufd device's virtual device (in the user
> space) per a viommu instance.
>
> While at it, introduce a struct IOMMUFDVdev for later use by vendor
> IOMMU implementations.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> backends/iommufd.c | 27 +++++++++++++++++++++++++++
> backends/trace-events | 1 +
> include/system/iommufd.h | 12 ++++++++++++
> 3 files changed, 40 insertions(+)
>
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index c65dc41d71..e3a3c1480e 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -473,6 +473,33 @@ bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
> return true;
> }
>
> +bool iommufd_backend_alloc_vdev(IOMMUFDBackend *be, uint32_t dev_id,
> + uint32_t viommu_id, uint64_t virt_id,
> + uint32_t *out_vdev_id, Error **errp)
> +{
> + int ret;
> + struct iommu_vdevice_alloc alloc_vdev = {
> + .size = sizeof(alloc_vdev),
> + .viommu_id = viommu_id,
> + .dev_id = dev_id,
> + .virt_id = virt_id,
> + };
> +
> + ret = ioctl(be->fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev);
> +
> + trace_iommufd_backend_alloc_vdev(be->fd, dev_id, viommu_id, virt_id,
> + alloc_vdev.out_vdevice_id, ret);
> +
> + if (ret) {
> + error_setg_errno(errp, errno, "IOMMU_VDEVICE_ALLOC failed");
> + return false;
> + }
> +
> + g_assert(out_vdev_id);
> + *out_vdev_id = alloc_vdev.out_vdevice_id;
> + return true;
> +}
> +
> bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> uint32_t hwpt_id, Error **errp)
> {
> diff --git a/backends/trace-events b/backends/trace-events
> index 1ae94c4290..14a7ecf5aa 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -22,3 +22,4 @@ iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) "
> iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t flags, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" flags=0x%"PRIx64" page_size=0x%"PRIx64" (%d)"
> iommufd_backend_invalidate_cache(int iommufd, uint32_t id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)"
> iommufd_backend_alloc_viommu(int iommufd, uint32_t dev_id, uint32_t type, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)"
> +iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)"
> diff --git a/include/system/iommufd.h b/include/system/iommufd.h
> index bc4acd1e8b..567dfb7b1d 100644
> --- a/include/system/iommufd.h
> +++ b/include/system/iommufd.h
> @@ -48,6 +48,14 @@ typedef struct IOMMUFDViommu {
> uint32_t viommu_id; /* virtual IOMMU ID of allocated object */
> } IOMMUFDViommu;
>
> +/*
> + * Virtual device object for a physical device bind to a vIOMMU.
> + */
> +typedef struct IOMMUFDVdev {
> + uint32_t vdevice_id; /* object handle for vDevice */
> + uint32_t virt_id; /* virtual device ID */
> +} IOMMUFDVdev;
> +
> bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
> void iommufd_backend_disconnect(IOMMUFDBackend *be);
>
> @@ -73,6 +81,10 @@ bool iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id,
> uint32_t viommu_type, uint32_t hwpt_id,
> uint32_t *out_hwpt, Error **errp);
>
> +bool iommufd_backend_alloc_vdev(IOMMUFDBackend *be, uint32_t dev_id,
> + uint32_t viommu_id, uint64_t virt_id,
> + uint32_t *out_vdev_id, Error **errp);
> +
> bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id,
> bool start, Error **errp);
> bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 03/37] hw/arm/smmu-common: Factor out common helper functions and export
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
2026-01-26 10:42 ` [PATCH v9 01/37] backends/iommufd: Introduce iommufd_backend_alloc_viommu Shameer Kolothum
2026-01-26 10:42 ` [PATCH v9 02/37] backends/iommufd: Introduce iommufd_backend_alloc_vdev Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 04/37] hw/arm/smmu-common: Make iommu ops part of SMMUState Shameer Kolothum
` (34 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Factor out common helper functions and export them. Subsequent patches for
smmuv3 accel support will make use of these helpers.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmu-common.c | 44 +++++++++++++++++++++---------------
include/hw/arm/smmu-common.h | 6 +++++
2 files changed, 32 insertions(+), 18 deletions(-)
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index cdcfb1343d..1492d7dd95 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -847,12 +847,24 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num)
return NULL;
}
-static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
+void smmu_init_sdev(SMMUState *s, SMMUDevice *sdev, PCIBus *bus, int devfn)
{
- SMMUState *s = opaque;
- SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus);
- SMMUDevice *sdev;
static unsigned int index;
+ g_autofree char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn,
+ index++);
+ sdev->smmu = s;
+ sdev->bus = bus;
+ sdev->devfn = devfn;
+
+ memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu),
+ s->mrtypename, OBJECT(s), name, UINT64_MAX);
+ address_space_init(&sdev->as, MEMORY_REGION(&sdev->iommu), name);
+ trace_smmu_add_mr(name);
+}
+
+SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus)
+{
+ SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus);
if (!sbus) {
sbus = g_malloc0(sizeof(SMMUPciBus) +
@@ -861,23 +873,19 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus);
}
+ return sbus;
+}
+
+static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
+{
+ SMMUState *s = opaque;
+ SMMUPciBus *sbus = smmu_get_sbus(s, bus);
+ SMMUDevice *sdev;
+
sdev = sbus->pbdev[devfn];
if (!sdev) {
- char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++);
-
sdev = sbus->pbdev[devfn] = g_new0(SMMUDevice, 1);
-
- sdev->smmu = s;
- sdev->bus = bus;
- sdev->devfn = devfn;
-
- memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu),
- s->mrtypename,
- OBJECT(s), name, UINT64_MAX);
- address_space_init(&sdev->as,
- MEMORY_REGION(&sdev->iommu), name);
- trace_smmu_add_mr(name);
- g_free(name);
+ smmu_init_sdev(s, sdev, bus, devfn);
}
return &sdev->as;
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index b49b2f27fa..f5060cf36f 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -184,6 +184,12 @@ OBJECT_DECLARE_TYPE(SMMUState, SMMUBaseClass, ARM_SMMU)
/* Return the SMMUPciBus handle associated to a PCI bus number */
SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num);
+/* Return the SMMUPciBus handle associated to a PCI bus */
+SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus);
+
+/* Initialize SMMUDevice handle associated to a SMMUPciBus */
+void smmu_init_sdev(SMMUState *s, SMMUDevice *sdev, PCIBus *bus, int devfn);
+
/* Return the stream ID of an SMMU device */
static inline uint16_t smmu_get_sid(SMMUDevice *sdev)
{
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 04/37] hw/arm/smmu-common: Make iommu ops part of SMMUState
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (2 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 03/37] hw/arm/smmu-common: Factor out common helper functions and export Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 05/37] hw/arm/smmuv3-accel: Introduce smmuv3 accel device Shameer Kolothum
` (33 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Make iommu ops part of SMMUState and set it to the current default smmu_ops.
No functional change intended. This will allow the SMMUv3 accel implementation
to set different iommu ops later.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmu-common.c | 7 +++++--
include/hw/arm/smmu-common.h | 1 +
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 1492d7dd95..58c4452b1f 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -959,6 +959,9 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
"smmu-secure-memory-view");
}
+ if (!s->iommu_ops) {
+ s->iommu_ops = &smmu_ops;
+ }
/*
* We only allow default PCIe Root Complex(pcie.0) or pxb-pcie based extra
* root complexes to be associated with SMMU.
@@ -978,9 +981,9 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
}
if (s->smmu_per_bus) {
- pci_setup_iommu_per_bus(pci_bus, &smmu_ops, s);
+ pci_setup_iommu_per_bus(pci_bus, s->iommu_ops, s);
} else {
- pci_setup_iommu(pci_bus, &smmu_ops, s);
+ pci_setup_iommu(pci_bus, s->iommu_ops, s);
}
return;
}
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index f5060cf36f..7b975abc25 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -166,6 +166,7 @@ struct SMMUState {
AddressSpace memory_as;
MemoryRegion *secure_memory;
AddressSpace secure_memory_as;
+ const PCIIOMMUOps *iommu_ops;
};
struct SMMUBaseClass {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 05/37] hw/arm/smmuv3-accel: Introduce smmuv3 accel device
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (3 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 04/37] hw/arm/smmu-common: Make iommu ops part of SMMUState Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 06/37] hw/arm/smmuv3-accel: Initialize shared system address space Shameer Kolothum
` (32 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Set up dedicated PCIIOMMUOps for the accel SMMUv3, since it will need
different callback handling in upcoming patches. This also adds a
CONFIG_ARM_SMMUV3_ACCEL build option so the feature can be disabled
at compile time. Because we now include CONFIG_DEVICES in the header to
check for ARM_SMMUV3_ACCEL, the meson file entry for smmuv3.c needs to
be changed to arm_ss.add.
The “accel” property isn’t user visible yet and it will be introduced in
a later patch once all the supporting pieces are ready.
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/Kconfig | 5 ++++
hw/arm/meson.build | 3 ++-
hw/arm/smmuv3-accel.c | 59 +++++++++++++++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 27 +++++++++++++++++++
hw/arm/smmuv3.c | 5 ++++
include/hw/arm/smmuv3.h | 3 +++
6 files changed, 101 insertions(+), 1 deletion(-)
create mode 100644 hw/arm/smmuv3-accel.c
create mode 100644 hw/arm/smmuv3-accel.h
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 97d747e206..c66c452737 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -626,8 +626,13 @@ config FSL_IMX8MP_EVK
depends on TCG
select FSL_IMX8MP
+config ARM_SMMUV3_ACCEL
+ bool
+ depends on ARM_SMMUV3
+
config ARM_SMMUV3
bool
+ select ARM_SMMUV3_ACCEL if IOMMUFD
config FSL_IMX6UL
bool
diff --git a/hw/arm/meson.build b/hw/arm/meson.build
index aeaf654790..c250487e64 100644
--- a/hw/arm/meson.build
+++ b/hw/arm/meson.build
@@ -84,7 +84,8 @@ arm_common_ss.add(when: 'CONFIG_ARMSSE', if_true: files('armsse.c'))
arm_common_ss.add(when: 'CONFIG_FSL_IMX7', if_true: files('fsl-imx7.c', 'mcimx7d-sabre.c'))
arm_common_ss.add(when: 'CONFIG_FSL_IMX8MP', if_true: files('fsl-imx8mp.c'))
arm_common_ss.add(when: 'CONFIG_FSL_IMX8MP_EVK', if_true: files('imx8mp-evk.c'))
-arm_common_ss.add(when: 'CONFIG_ARM_SMMUV3', if_true: files('smmuv3.c'))
+arm_ss.add(when: 'CONFIG_ARM_SMMUV3', if_true: files('smmuv3.c'))
+arm_ss.add(when: 'CONFIG_ARM_SMMUV3_ACCEL', if_true: files('smmuv3-accel.c'))
arm_common_ss.add(when: 'CONFIG_FSL_IMX6UL', if_true: files('fsl-imx6ul.c', 'mcimx6ul-evk.c'))
arm_common_ss.add(when: 'CONFIG_NRF51_SOC', if_true: files('nrf51_soc.c'))
arm_common_ss.add(when: 'CONFIG_XEN', if_true: files(
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
new file mode 100644
index 0000000000..99ef0db8c4
--- /dev/null
+++ b/hw/arm/smmuv3-accel.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd
+ * Copyright (C) 2025 NVIDIA
+ * Written by Nicolin Chen, Shameer Kolothum
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/arm/smmuv3.h"
+#include "smmuv3-accel.h"
+
+static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
+ PCIBus *bus, int devfn)
+{
+ SMMUDevice *sdev = sbus->pbdev[devfn];
+ SMMUv3AccelDevice *accel_dev;
+
+ if (sdev) {
+ return container_of(sdev, SMMUv3AccelDevice, sdev);
+ }
+
+ accel_dev = g_new0(SMMUv3AccelDevice, 1);
+ sdev = &accel_dev->sdev;
+
+ sbus->pbdev[devfn] = sdev;
+ smmu_init_sdev(bs, sdev, bus, devfn);
+ return accel_dev;
+}
+
+/*
+ * Find or add an address space for the given PCI device.
+ *
+ * If a device matching @bus and @devfn already exists, return its
+ * corresponding address space. Otherwise, create a new device entry
+ * and initialize address space for it.
+ */
+static AddressSpace *smmuv3_accel_find_add_as(PCIBus *bus, void *opaque,
+ int devfn)
+{
+ SMMUState *bs = opaque;
+ SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
+ SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
+ SMMUDevice *sdev = &accel_dev->sdev;
+
+ return &sdev->as;
+}
+
+static const PCIIOMMUOps smmuv3_accel_ops = {
+ .get_address_space = smmuv3_accel_find_add_as,
+};
+
+void smmuv3_accel_init(SMMUv3State *s)
+{
+ SMMUState *bs = ARM_SMMU(s);
+
+ bs->iommu_ops = &smmuv3_accel_ops;
+}
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
new file mode 100644
index 0000000000..0dc6b00d35
--- /dev/null
+++ b/hw/arm/smmuv3-accel.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd
+ * Copyright (C) 2025 NVIDIA
+ * Written by Nicolin Chen, Shameer Kolothum
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_ARM_SMMUV3_ACCEL_H
+#define HW_ARM_SMMUV3_ACCEL_H
+
+#include "hw/arm/smmu-common.h"
+#include CONFIG_DEVICES
+
+typedef struct SMMUv3AccelDevice {
+ SMMUDevice sdev;
+} SMMUv3AccelDevice;
+
+#ifdef CONFIG_ARM_SMMUV3_ACCEL
+void smmuv3_accel_init(SMMUv3State *s);
+#else
+static inline void smmuv3_accel_init(SMMUv3State *s)
+{
+}
+#endif
+
+#endif /* HW_ARM_SMMUV3_ACCEL_H */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 985dfb345f..95d44f81ed 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -32,6 +32,7 @@
#include "qapi/error.h"
#include "hw/arm/smmuv3.h"
+#include "smmuv3-accel.h"
#include "smmuv3-internal.h"
#include "smmu-internal.h"
@@ -1882,6 +1883,10 @@ static void smmu_realize(DeviceState *d, Error **errp)
SysBusDevice *dev = SYS_BUS_DEVICE(d);
Error *local_err = NULL;
+ if (s->accel) {
+ smmuv3_accel_init(s);
+ }
+
c->parent_realize(d, &local_err);
if (local_err) {
error_propagate(errp, local_err);
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index d183a62766..bb7076286b 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -63,6 +63,9 @@ struct SMMUv3State {
qemu_irq irq[4];
QemuMutex mutex;
char *stage;
+
+ /* SMMU has HW accelerator support for nested S1 + s2 */
+ bool accel;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 06/37] hw/arm/smmuv3-accel: Initialize shared system address space
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (4 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 05/37] hw/arm/smmuv3-accel: Introduce smmuv3 accel device Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 07/37] hw/pci/pci: Move pci_init_bus_master() after adding device to bus Shameer Kolothum
` (31 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
To support accelerated SMMUv3 instances, introduce a shared system-wide
AddressSpace (shared_as_sysmem) that aliases the global system memory.
This shared AddressSpace will be used in a subsequent patch for all
vfio-pci devices behind all accelerated SMMUv3 instances within a VM.
Sharing a single system AddressSpace ensures that all devices behind
accelerated SMMUv3s use the same system address space pointer. This
allows VFIO/iommufd to reuse a single IOAS ID in iommufd_cdev_attach(),
enabling the Stage-2 page tables to be shared within the VM rather than
duplicated for each SMMUv3 instance.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 99ef0db8c4..b2eded743e 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -11,6 +11,14 @@
#include "hw/arm/smmuv3.h"
#include "smmuv3-accel.h"
+/*
+ * The root region aliases the global system memory, and shared_as_sysmem
+ * provides a shared Address Space referencing it. This Address Space is used
+ * by all vfio-pci devices behind all accelerated SMMUv3 instances within a VM.
+ */
+static MemoryRegion root, sysmem;
+static AddressSpace *shared_as_sysmem;
+
static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
PCIBus *bus, int devfn)
{
@@ -51,9 +59,27 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.get_address_space = smmuv3_accel_find_add_as,
};
+static void smmuv3_accel_as_init(SMMUv3State *s)
+{
+
+ if (shared_as_sysmem) {
+ return;
+ }
+
+ memory_region_init(&root, OBJECT(s), "root", UINT64_MAX);
+ memory_region_init_alias(&sysmem, OBJECT(s), "smmuv3-accel-sysmem",
+ get_system_memory(), 0,
+ memory_region_size(get_system_memory()));
+ memory_region_add_subregion(&root, 0, &sysmem);
+
+ shared_as_sysmem = g_new0(AddressSpace, 1);
+ address_space_init(shared_as_sysmem, &root, "smmuv3-accel-as-sysmem");
+}
+
void smmuv3_accel_init(SMMUv3State *s)
{
SMMUState *bs = ARM_SMMU(s);
bs->iommu_ops = &smmuv3_accel_ops;
+ smmuv3_accel_as_init(s);
}
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 07/37] hw/pci/pci: Move pci_init_bus_master() after adding device to bus
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (5 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 06/37] hw/arm/smmuv3-accel: Initialize shared system address space Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 08/37] hw/pci/pci: Add optional supports_address_space() callback Shameer Kolothum
` (30 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
During PCI hotplug, in do_pci_register_device(), pci_init_bus_master()
is called before storing the pci_dev pointer in bus->devices[devfn].
This causes a problem if pci_init_bus_master() (via its
get_address_space() callback) attempts to retrieve the device using
pci_find_device(), since the PCI device is not yet visible on the bus.
Fix this by moving the pci_init_bus_master() call to after the device
has been added to bus->devices[devfn].
This prepares for a subsequent patch where the accel SMMUv3
get_address_space() callback retrieves the pci_dev to identify the
attached device type.
No functional change intended.
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pci.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 8cbf5f5d70..229ea7cfb1 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1381,9 +1381,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
pci_dev->bus_master_as.max_bounce_buffer_size =
pci_dev->max_bounce_buffer_size;
- if (phase_check(PHASE_MACHINE_READY)) {
- pci_init_bus_master(pci_dev);
- }
pci_dev->irq_state = 0;
pci_config_alloc(pci_dev);
@@ -1427,6 +1424,9 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
pci_dev->config_write = config_write;
bus->devices[devfn] = pci_dev;
pci_dev->version_id = 2; /* Current pci device vmstate version */
+ if (phase_check(PHASE_MACHINE_READY)) {
+ pci_init_bus_master(pci_dev);
+ }
return pci_dev;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 07/37] hw/pci/pci: Move pci_init_bus_master() after adding device to bus
2026-01-26 10:43 ` [PATCH v9 07/37] hw/pci/pci: Move pci_init_bus_master() after adding device to bus Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju,
Michael S . Tsirkin
On 2026/1/26 18:43, Shameer Kolothum wrote:
> During PCI hotplug, in do_pci_register_device(), pci_init_bus_master()
> is called before storing the pci_dev pointer in bus->devices[devfn].
>
> This causes a problem if pci_init_bus_master() (via its
> get_address_space() callback) attempts to retrieve the device using
> pci_find_device(), since the PCI device is not yet visible on the bus.
>
> Fix this by moving the pci_init_bus_master() call to after the device
> has been added to bus->devices[devfn].
>
> This prepares for a subsequent patch where the accel SMMUv3
> get_address_space() callback retrieves the pci_dev to identify the
> attached device type.
>
> No functional change intended.
>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci/pci.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 8cbf5f5d70..229ea7cfb1 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1381,9 +1381,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> pci_dev->bus_master_as.max_bounce_buffer_size =
> pci_dev->max_bounce_buffer_size;
>
> - if (phase_check(PHASE_MACHINE_READY)) {
> - pci_init_bus_master(pci_dev);
> - }
> pci_dev->irq_state = 0;
> pci_config_alloc(pci_dev);
>
> @@ -1427,6 +1424,9 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> pci_dev->config_write = config_write;
> bus->devices[devfn] = pci_dev;
> pci_dev->version_id = 2; /* Current pci device vmstate version */
> + if (phase_check(PHASE_MACHINE_READY)) {
> + pci_init_bus_master(pci_dev);
> + }
> return pci_dev;
> }
>
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 08/37] hw/pci/pci: Add optional supports_address_space() callback
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (6 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 07/37] hw/pci/pci: Move pci_init_bus_master() after adding device to bus Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 09/37] hw/pci-bridge/pci_expander_bridge: Move TYPE_PXB_PCIE_DEV to header Shameer Kolothum
` (29 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
Introduce an optional supports_address_space() callback in PCIIOMMUOps to
allow a vIOMMU implementation to reject devices that should not be attached
to it.
Currently, get_address_space() is the first and mandatory callback into the
vIOMMU layer, which always returns an address space. For certain setups, such
as hardware accelerated vIOMMUs (e.g. ARM SMMUv3 with accel=on), attaching
emulated endpoint devices is undesirable as it may impact the behavior or
performance of VFIO passthrough devices, for example, by triggering
unnecessary invalidations on the host IOMMU.
The new callback allows a vIOMMU to check and reject unsupported devices
early during PCI device registration.
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pci.c | 20 ++++++++++++++++++++
include/hw/pci/pci.h | 19 +++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 229ea7cfb1..101e745bd5 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -135,6 +135,21 @@ static void pci_set_master(PCIDevice *d, bool enable)
d->is_master = enable; /* cache the status */
}
+static bool
+pci_device_supports_iommu_address_space(PCIDevice *dev, Error **errp)
+{
+ PCIBus *bus;
+ PCIBus *iommu_bus;
+ int devfn;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+ if (iommu_bus && iommu_bus->iommu_ops->supports_address_space) {
+ return iommu_bus->iommu_ops->supports_address_space(bus,
+ iommu_bus->iommu_opaque, devfn, errp);
+ }
+ return true;
+}
+
static void pci_init_bus_master(PCIDevice *pci_dev)
{
AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
@@ -1424,6 +1439,11 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
pci_dev->config_write = config_write;
bus->devices[devfn] = pci_dev;
pci_dev->version_id = 2; /* Current pci device vmstate version */
+ if (!pci_device_supports_iommu_address_space(pci_dev, errp)) {
+ do_pci_unregister_device(pci_dev);
+ bus->devices[devfn] = NULL;
+ return NULL;
+ }
if (phase_check(PHASE_MACHINE_READY)) {
pci_init_bus_master(pci_dev);
}
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 6fd8984c99..ddb0c98e9f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -417,6 +417,25 @@ typedef struct IOMMUPRINotifier {
* framework for a set of devices on a PCI bus.
*/
typedef struct PCIIOMMUOps {
+ /**
+ * @supports_address_space: Optional pre-check to determine whether a PCI
+ * device can be associated with an IOMMU. If this callback returns true,
+ * the IOMMU accepts the device association and get_address_space() can be
+ * called to obtain the address_space to be used.
+ *
+ * @bus: the #PCIBus being accessed.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number.
+ *
+ * @errp: pass an Error out only when return false
+ *
+ * Returns: true if the device can be associated with an IOMMU, false
+ * otherwise with errp set.
+ */
+ bool (*supports_address_space)(PCIBus *bus, void *opaque, int devfn,
+ Error **errp);
/**
* @get_address_space: get the address space for a set of devices
* on a PCI bus.
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 08/37] hw/pci/pci: Add optional supports_address_space() callback
2026-01-26 10:43 ` [PATCH v9 08/37] hw/pci/pci: Add optional supports_address_space() callback Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju,
Michael S . Tsirkin
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Introduce an optional supports_address_space() callback in PCIIOMMUOps to
> allow a vIOMMU implementation to reject devices that should not be attached
> to it.
>
> Currently, get_address_space() is the first and mandatory callback into the
> vIOMMU layer, which always returns an address space. For certain setups, such
> as hardware accelerated vIOMMUs (e.g. ARM SMMUv3 with accel=on), attaching
> emulated endpoint devices is undesirable as it may impact the behavior or
> performance of VFIO passthrough devices, for example, by triggering
> unnecessary invalidations on the host IOMMU.
>
> The new callback allows a vIOMMU to check and reject unsupported devices
> early during PCI device registration.
>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci/pci.c | 20 ++++++++++++++++++++
> include/hw/pci/pci.h | 19 +++++++++++++++++++
> 2 files changed, 39 insertions(+)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 229ea7cfb1..101e745bd5 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -135,6 +135,21 @@ static void pci_set_master(PCIDevice *d, bool enable)
> d->is_master = enable; /* cache the status */
> }
>
> +static bool
> +pci_device_supports_iommu_address_space(PCIDevice *dev, Error **errp)
> +{
> + PCIBus *bus;
> + PCIBus *iommu_bus;
> + int devfn;
> +
> + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
> + if (iommu_bus && iommu_bus->iommu_ops->supports_address_space) {
> + return iommu_bus->iommu_ops->supports_address_space(bus,
> + iommu_bus->iommu_opaque, devfn, errp);
> + }
> + return true;
> +}
> +
> static void pci_init_bus_master(PCIDevice *pci_dev)
> {
> AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
> @@ -1424,6 +1439,11 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> pci_dev->config_write = config_write;
> bus->devices[devfn] = pci_dev;
> pci_dev->version_id = 2; /* Current pci device vmstate version */
> + if (!pci_device_supports_iommu_address_space(pci_dev, errp)) {
> + do_pci_unregister_device(pci_dev);
> + bus->devices[devfn] = NULL;
> + return NULL;
> + }
> if (phase_check(PHASE_MACHINE_READY)) {
> pci_init_bus_master(pci_dev);
> }
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 6fd8984c99..ddb0c98e9f 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -417,6 +417,25 @@ typedef struct IOMMUPRINotifier {
> * framework for a set of devices on a PCI bus.
> */
> typedef struct PCIIOMMUOps {
> + /**
> + * @supports_address_space: Optional pre-check to determine whether a PCI
> + * device can be associated with an IOMMU. If this callback returns true,
> + * the IOMMU accepts the device association and get_address_space() can be
> + * called to obtain the address_space to be used.
> + *
> + * @bus: the #PCIBus being accessed.
> + *
> + * @opaque: the data passed to pci_setup_iommu().
> + *
> + * @devfn: device and function number.
> + *
> + * @errp: pass an Error out only when return false
> + *
> + * Returns: true if the device can be associated with an IOMMU, false
> + * otherwise with errp set.
> + */
> + bool (*supports_address_space)(PCIBus *bus, void *opaque, int devfn,
> + Error **errp);
> /**
> * @get_address_space: get the address space for a set of devices
> * on a PCI bus.
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 09/37] hw/pci-bridge/pci_expander_bridge: Move TYPE_PXB_PCIE_DEV to header
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (7 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 08/37] hw/pci/pci: Add optional supports_address_space() callback Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 10/37] hw/arm/smmuv3-accel: Restrict accelerated SMMUv3 to vfio-pci endpoints with iommufd Shameer Kolothum
` (28 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Move the TYPE_PXB_PCIE_DEV definition to the header so that it can be
referenced by other code in a subsequent patch.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci-bridge/pci_expander_bridge.c | 1 -
include/hw/pci/pci_bridge.h | 1 +
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index 08d40aa2ea..b6e2eb7969 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -48,7 +48,6 @@ struct PXBBus {
char bus_path[8];
};
-#define TYPE_PXB_PCIE_DEV "pxb-pcie"
OBJECT_DECLARE_SIMPLE_TYPE(PXBPCIEDev, PXB_PCIE_DEV)
static GList *pxb_dev_list;
diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
index a055fd8d32..b61360b900 100644
--- a/include/hw/pci/pci_bridge.h
+++ b/include/hw/pci/pci_bridge.h
@@ -106,6 +106,7 @@ typedef struct PXBPCIEDev {
#define TYPE_PXB_PCIE_BUS "pxb-pcie-bus"
#define TYPE_PXB_CXL_BUS "pxb-cxl-bus"
+#define TYPE_PXB_PCIE_DEV "pxb-pcie"
#define TYPE_PXB_DEV "pxb"
OBJECT_DECLARE_SIMPLE_TYPE(PXBDev, PXB_DEV)
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 09/37] hw/pci-bridge/pci_expander_bridge: Move TYPE_PXB_PCIE_DEV to header
2026-01-26 10:43 ` [PATCH v9 09/37] hw/pci-bridge/pci_expander_bridge: Move TYPE_PXB_PCIE_DEV to header Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Move the TYPE_PXB_PCIE_DEV definition to header so that it can be
> referenced by other code in subsequent patch.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci-bridge/pci_expander_bridge.c | 1 -
> include/hw/pci/pci_bridge.h | 1 +
> 2 files changed, 1 insertion(+), 1 deletion(-)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
> index 08d40aa2ea..b6e2eb7969 100644
> --- a/hw/pci-bridge/pci_expander_bridge.c
> +++ b/hw/pci-bridge/pci_expander_bridge.c
> @@ -48,7 +48,6 @@ struct PXBBus {
> char bus_path[8];
> };
>
> -#define TYPE_PXB_PCIE_DEV "pxb-pcie"
> OBJECT_DECLARE_SIMPLE_TYPE(PXBPCIEDev, PXB_PCIE_DEV)
>
> static GList *pxb_dev_list;
> diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
> index a055fd8d32..b61360b900 100644
> --- a/include/hw/pci/pci_bridge.h
> +++ b/include/hw/pci/pci_bridge.h
> @@ -106,6 +106,7 @@ typedef struct PXBPCIEDev {
>
> #define TYPE_PXB_PCIE_BUS "pxb-pcie-bus"
> #define TYPE_PXB_CXL_BUS "pxb-cxl-bus"
> +#define TYPE_PXB_PCIE_DEV "pxb-pcie"
> #define TYPE_PXB_DEV "pxb"
> OBJECT_DECLARE_SIMPLE_TYPE(PXBDev, PXB_DEV)
>
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 10/37] hw/arm/smmuv3-accel: Restrict accelerated SMMUv3 to vfio-pci endpoints with iommufd
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (8 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 09/37] hw/pci-bridge/pci_expander_bridge: Move TYPE_PXB_PCIE_DEV to header Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 11/37] hw/arm/smmuv3: Implement get_viommu_cap() callback Shameer Kolothum
` (27 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Accelerated SMMUv3 is only meaningful when a device can leverage the host
SMMUv3 in nested mode (S1+S2 translation). To keep the model consistent
and correct, this mode is restricted to vfio-pci endpoint devices using
the iommufd backend.
Non-endpoint emulated devices such as PCIe root ports and bridges are also
permitted so that vfio-pci devices can be attached downstream. All other
device types are unsupported in accelerated mode.
Implement the supports_address_space() callback to reject all such
unsupported devices.
This restriction also avoids complications with IOTLB invalidations. Some
TLBI commands (e.g. CMD_TLBI_NH_ASID) lack an associated SID, making it
difficult to trace the originating device. Allowing emulated endpoints
would require invalidating both QEMU’s software IOTLB and the host’s
hardware IOTLB, which can significantly degrade performance.
A key design choice is the address space returned for accelerated vfio-pci
endpoints. VFIO core has a container that manages an HWPT. By default, it
allocates a stage-1 normal HWPT, unless the vIOMMU requests a nesting
parent HWPT for accelerated cases.
VFIO core adds a listener for that HWPT and sets up a handler,
vfio_container_region_add(), where it checks the memory region:
- If the region is a non-IOMMU translated one (system address space), VFIO
  treats it as RAM and handles all stage-2 mappings for the core allocated
  nesting parent HWPT.
- If the region is an IOMMU address space, VFIO instead enables IOTLB
  notifier handling and translation replay, skipping the RAM listener and
  therefore not installing stage-2 mappings.
For accelerated SMMUv3, correct operation requires the S1+S2 nesting
model, and therefore VFIO must take the "system address space" path so
that stage-2 mappings are properly built. Returning an alias of the
system address space ensures this happens. Returning the IOMMU address
space would omit stage-2 mapping and break nested translation.
Another option considered was forcing a pre-registration path using
vfio_prereg_listener() to set up stage-2 mappings, but this requires
changes in VFIO core and was not adopted. Returning an alias of the
system address space keeps the design aligned with existing VFIO/iommufd
nesting flows and avoids the need for cross-subsystem changes.
In summary:
- vfio-pci devices (with iommufd as backend) return an address space
  aliased to the system address space.
- Bridges and root ports return the IOMMU address space.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 77 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 76 insertions(+), 1 deletion(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index b2eded743e..2fcd301322 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -7,8 +7,13 @@
*/
#include "qemu/osdep.h"
+#include "qemu/error-report.h"
#include "hw/arm/smmuv3.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci-host/gpex.h"
+#include "hw/vfio/pci.h"
+
#include "smmuv3-accel.h"
/*
@@ -37,6 +42,48 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
return accel_dev;
}
+/*
+ * Only allow PCIe bridges, pxb-pcie roots, and GPEX roots so vfio-pci
+ * endpoints can sit downstream. Accelerated SMMUv3 requires a vfio-pci
+ * endpoint using the iommufd backend; all other device types are rejected.
+ * This avoids supporting emulated endpoints, which would complicate IOTLB
+ * invalidation and hurt performance.
+ */
+static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
+{
+
+ if (object_dynamic_cast(OBJECT(pdev), TYPE_PCI_BRIDGE) ||
+ object_dynamic_cast(OBJECT(pdev), TYPE_PXB_PCIE_DEV) ||
+ object_dynamic_cast(OBJECT(pdev), TYPE_GPEX_ROOT_DEVICE)) {
+ return true;
+ } else if ((object_dynamic_cast(OBJECT(pdev), TYPE_VFIO_PCI))) {
+ *vfio_pci = true;
+ if (object_property_get_link(OBJECT(pdev), "iommufd", NULL)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool smmuv3_accel_supports_as(PCIBus *bus, void *opaque, int devfn,
+ Error **errp)
+{
+ PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+ bool vfio_pci = false;
+
+ if (pdev && !smmuv3_accel_pdev_allowed(pdev, &vfio_pci)) {
+ if (vfio_pci) {
+ error_setg(errp, "vfio-pci endpoint devices without an iommufd "
+ "backend not allowed when using arm-smmuv3,accel=on");
+
+ } else {
+ error_setg(errp, "Emulated endpoint devices are not allowed when "
+ "using arm-smmuv3,accel=on");
+ }
+ return false;
+ }
+ return true;
+}
/*
* Find or add an address space for the given PCI device.
*
@@ -47,15 +94,43 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
static AddressSpace *smmuv3_accel_find_add_as(PCIBus *bus, void *opaque,
int devfn)
{
+ PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
SMMUState *bs = opaque;
SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
SMMUDevice *sdev = &accel_dev->sdev;
+ bool vfio_pci = false;
- return &sdev->as;
+ if (pdev && !smmuv3_accel_pdev_allowed(pdev, &vfio_pci)) {
+ /* Should never be here: supports_address_space() filters these out */
+ g_assert_not_reached();
+ }
+
+ /*
+ * In the accelerated mode, a vfio-pci device attached via the iommufd
+ * backend must remain in the system address space. Such a device is
+ * always translated by its physical SMMU (using either a stage-2-only
+ * STE or a nested STE), where the parent stage-2 page table is allocated
+ * by the VFIO core to back the system address space.
+ *
+ * Return the shared_as_sysmem aliased to the global system memory in this
+ * case. Sharing address_space_memory also allows devices under different
+ * vSMMU instances in the same VM to reuse a single nesting parent HWPT in
+ * the VFIO core.
+ *
+ * For non-endpoint emulated devices such as PCIe root ports and bridges,
+ * which may use the normal emulated translation path and software IOTLBs,
+ * return the SMMU's IOMMU address space.
+ */
+ if (vfio_pci) {
+ return shared_as_sysmem;
+ } else {
+ return &sdev->as;
+ }
}
static const PCIIOMMUOps smmuv3_accel_ops = {
+ .supports_address_space = smmuv3_accel_supports_as,
.get_address_space = smmuv3_accel_find_add_as,
};
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 11/37] hw/arm/smmuv3: Implement get_viommu_cap() callback
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (9 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 10/37] hw/arm/smmuv3-accel: Restrict accelerated SMMUv3 to vfio-pci endpoints with iommufd Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback Shameer Kolothum
` (26 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
For accelerated SMMUv3, a nesting parent domain must be created. Add the
get_viommu_flags() callback so that VFIO can create the nesting parent.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 2fcd301322..be09cf8b73 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -10,6 +10,7 @@
#include "qemu/error-report.h"
#include "hw/arm/smmuv3.h"
+#include "hw/core/iommu.h"
#include "hw/pci/pci_bridge.h"
#include "hw/pci-host/gpex.h"
#include "hw/vfio/pci.h"
@@ -129,9 +130,21 @@ static AddressSpace *smmuv3_accel_find_add_as(PCIBus *bus, void *opaque,
}
}
+static uint64_t smmuv3_accel_get_viommu_flags(void *opaque)
+{
+ /*
+ * We return VIOMMU_FLAG_WANT_NESTING_PARENT to inform VFIO core to create a
+ * nesting parent which is required for accelerated SMMUv3 support.
+ * The real HW nested support should be reported from host SMMUv3 and if
+ * it doesn't, the nesting parent allocation will fail anyway in VFIO core.
+ */
+ return VIOMMU_FLAG_WANT_NESTING_PARENT;
+}
+
static const PCIIOMMUOps smmuv3_accel_ops = {
.supports_address_space = smmuv3_accel_supports_as,
.get_address_space = smmuv3_accel_find_add_as,
+ .get_viommu_flags = smmuv3_accel_get_viommu_flags,
};
static void smmuv3_accel_as_init(SMMUv3State *s)
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (10 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 11/37] hw/arm/smmuv3: Implement get_viommu_cap() callback Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-04-15 13:02 ` Anton Kuchin
2026-01-26 10:43 ` [PATCH v9 13/37] hw/arm/smmuv3: propagate smmuv3_cmdq_consume() errors to caller Shameer Kolothum
` (25 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
From: Nicolin Chen <nicolinc@nvidia.com>
Implement the VFIO/PCI callbacks to attach and detach a HostIOMMUDevice
to a vSMMUv3 when accel=on:
- set_iommu_device(): attach a HostIOMMUDevice to a vIOMMU
- unset_iommu_device(): detach and release associated resources
In SMMUv3 accel=on mode, the guest SMMUv3 is backed by the host SMMUv3 via
IOMMUFD. A vIOMMU object (created via IOMMU_VIOMMU_ALLOC) provides a per-VM,
security-isolated handle to the physical SMMUv3. Without a vIOMMU, the
vSMMUv3 cannot relay guest operations to the host hardware nor maintain
isolation across VMs or devices. Therefore, set_iommu_device() allocates
a vIOMMU object if one does not already exist.
There are two main points to consider in this implementation:
1) VFIO core allocates and attaches a S2 HWPT that acts as the nesting
parent for nested HWPTs(IOMMU_DOMAIN_NESTED). This parent HWPT will
be shared across multiple vSMMU instances within a VM.
2) A device cannot attach directly to a vIOMMU. Instead, it attaches
through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). Based on the STE
configuration, there are three types of nested HWPTs: bypass, abort,
and translate.
-The bypass and abort proxy HWPTs are pre-allocated. When SMMUv3
operates in global abort or bypass modes, as controlled by the GBPA
register, or issues a vSTE for bypass or abort we attach these
pre-allocated nested HWPTs.
-The translate HWPT requires a vDEVICE to be allocated first, since
invalidations and events depend on a valid vSID.
-The vDEVICE allocation and attach operations for vSTE based HWPTs
are implemented in subsequent patches.
In summary, a device placed behind a vSMMU instance must have a vSID for a
translate vSTE. The bypass and abort vSTEs are pre-allocated as proxy
nested HWPTs and are attached based on the GBPA register. The core-managed
nesting parent S2 HWPT is used as parent S2 HWPT for all the nested
HWPTs and is intended to be shared across vSMMU instances within the
same VM.
set_iommu_device():
- Reuse an existing vIOMMU for the same physical SMMU if available.
If not, allocate a new one using the nesting parent S2 HWPT.
- Pre-allocate two proxy nested HWPTs (bypass and abort) under the
vIOMMU and install one based on GBPA.ABORT value.
- Add the device to the vIOMMU’s device list.
unset_iommu_device():
- Re-attach device to the nesting parent S2 HWPT.
- Remove the device from the vIOMMU’s device list.
- If the list is empty, free the proxy HWPTs (bypass and abort)
and release the vIOMMU object.
Introduce struct SMMUv3AccelState, representing an accelerated SMMUv3
instance backed by an iommufd vIOMMU object, and storing the bypass and
abort proxy HWPT IDs.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 156 +++++++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 18 ++++
hw/arm/trace-events | 4 +
include/hw/arm/smmuv3-common.h | 3 +
include/hw/arm/smmuv3.h | 1 +
5 files changed, 182 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index be09cf8b73..9c2b917a11 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -8,6 +8,7 @@
#include "qemu/osdep.h"
#include "qemu/error-report.h"
+#include "trace.h"
#include "hw/arm/smmuv3.h"
#include "hw/core/iommu.h"
@@ -15,6 +16,7 @@
#include "hw/pci-host/gpex.h"
#include "hw/vfio/pci.h"
+#include "smmuv3-internal.h"
#include "smmuv3-accel.h"
/*
@@ -43,6 +45,157 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
return accel_dev;
}
+static uint32_t smmuv3_accel_gbpa_hwpt(SMMUv3State *s, SMMUv3AccelState *accel)
+{
+ return FIELD_EX32(s->gbpa, GBPA, ABORT) ?
+ accel->abort_hwpt_id : accel->bypass_hwpt_id;
+}
+
+static bool
+smmuv3_accel_alloc_viommu(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
+ Error **errp)
+{
+ SMMUv3AccelState *accel = s->s_accel;
+ struct iommu_hwpt_arm_smmuv3 bypass_data = {
+ .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
+ };
+ struct iommu_hwpt_arm_smmuv3 abort_data = {
+ .ste = { SMMU_STE_VALID, 0x0ULL },
+ };
+ uint32_t s2_hwpt_id = idev->hwpt_id;
+ uint32_t viommu_id, hwpt_id;
+ IOMMUFDViommu *viommu;
+
+ if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
+ IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
+ s2_hwpt_id, &viommu_id, errp)) {
+ return false;
+ }
+
+ viommu = g_new0(IOMMUFDViommu, 1);
+ viommu->viommu_id = viommu_id;
+ viommu->s2_hwpt_id = s2_hwpt_id;
+ viommu->iommufd = idev->iommufd;
+
+ /*
+ * Pre-allocate HWPTs for S1 bypass and abort cases. These will be attached
+ * later for guest STEs or GBPAs that require bypass or abort configuration.
+ */
+ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, viommu_id,
+ 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
+ sizeof(abort_data), &abort_data,
+ &accel->abort_hwpt_id, errp)) {
+ goto free_viommu;
+ }
+
+ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, viommu_id,
+ 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
+ sizeof(bypass_data), &bypass_data,
+ &accel->bypass_hwpt_id, errp)) {
+ goto free_abort_hwpt;
+ }
+
+ /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
+ hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
+ if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
+ goto free_bypass_hwpt;
+ }
+ accel->viommu = viommu;
+ return true;
+
+free_bypass_hwpt:
+ iommufd_backend_free_id(idev->iommufd, accel->bypass_hwpt_id);
+free_abort_hwpt:
+ iommufd_backend_free_id(idev->iommufd, accel->abort_hwpt_id);
+free_viommu:
+ iommufd_backend_free_id(idev->iommufd, viommu->viommu_id);
+ g_free(viommu);
+ return false;
+}
+
+static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+ HostIOMMUDevice *hiod, Error **errp)
+{
+ HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
+ SMMUState *bs = opaque;
+ SMMUv3State *s = ARM_SMMUV3(bs);
+ SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
+ SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
+
+ if (!idev) {
+ return true;
+ }
+
+ if (accel_dev->idev) {
+ if (accel_dev->idev != idev) {
+ error_setg(errp, "Device already has an associated idev 0x%x",
+ idev->devid);
+ return false;
+ }
+ return true;
+ }
+
+ if (s->s_accel->viommu) {
+ goto done;
+ }
+
+ if (!smmuv3_accel_alloc_viommu(s, idev, errp)) {
+ error_append_hint(errp, "Unable to alloc vIOMMU: idev devid 0x%x: ",
+ idev->devid);
+ return false;
+ }
+
+done:
+ accel_dev->idev = idev;
+ accel_dev->s_accel = s->s_accel;
+ QLIST_INSERT_HEAD(&s->s_accel->device_list, accel_dev, next);
+ trace_smmuv3_accel_set_iommu_device(devfn, idev->devid);
+ return true;
+}
+
+static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
+ int devfn)
+{
+ SMMUState *bs = opaque;
+ SMMUPciBus *sbus = g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus);
+ HostIOMMUDeviceIOMMUFD *idev;
+ SMMUv3AccelDevice *accel_dev;
+ SMMUv3AccelState *accel;
+ SMMUDevice *sdev;
+
+ if (!sbus) {
+ return;
+ }
+
+ sdev = sbus->pbdev[devfn];
+ if (!sdev) {
+ return;
+ }
+
+ accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
+ idev = accel_dev->idev;
+ accel = accel_dev->s_accel;
+ /* Re-attach the default s2 hwpt id */
+ if (!host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, NULL)) {
+ error_report("Unable to attach the default HW pagetable: idev devid "
+ "0x%x", idev->devid);
+ }
+
+ accel_dev->idev = NULL;
+ accel_dev->s_accel = NULL;
+ QLIST_REMOVE(accel_dev, next);
+ trace_smmuv3_accel_unset_iommu_device(devfn, idev->devid);
+
+ if (QLIST_EMPTY(&accel->device_list)) {
+ iommufd_backend_free_id(accel->viommu->iommufd, accel->bypass_hwpt_id);
+ iommufd_backend_free_id(accel->viommu->iommufd, accel->abort_hwpt_id);
+ iommufd_backend_free_id(accel->viommu->iommufd,
+ accel->viommu->viommu_id);
+ g_free(accel->viommu);
+ accel->viommu = NULL;
+ }
+}
+
/*
* Only allow PCIe bridges, pxb-pcie roots, and GPEX roots so vfio-pci
* endpoints can sit downstream. Accelerated SMMUv3 requires a vfio-pci
@@ -145,6 +298,8 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.supports_address_space = smmuv3_accel_supports_as,
.get_address_space = smmuv3_accel_find_add_as,
.get_viommu_flags = smmuv3_accel_get_viommu_flags,
+ .set_iommu_device = smmuv3_accel_set_iommu_device,
+ .unset_iommu_device = smmuv3_accel_unset_iommu_device,
};
static void smmuv3_accel_as_init(SMMUv3State *s)
@@ -168,6 +323,7 @@ void smmuv3_accel_init(SMMUv3State *s)
{
SMMUState *bs = ARM_SMMU(s);
+ s->s_accel = g_new0(SMMUv3AccelState, 1);
bs->iommu_ops = &smmuv3_accel_ops;
smmuv3_accel_as_init(s);
}
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index 0dc6b00d35..efb631db4f 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -10,10 +10,28 @@
#define HW_ARM_SMMUV3_ACCEL_H
#include "hw/arm/smmu-common.h"
+#include "system/iommufd.h"
+#ifdef CONFIG_LINUX
+#include <linux/iommufd.h>
+#endif
#include CONFIG_DEVICES
+/*
+ * Represents an accelerated SMMU instance backed by an iommufd vIOMMU object.
+ * Holds bypass and abort proxy HWPT IDs used for device attachment.
+ */
+typedef struct SMMUv3AccelState {
+ IOMMUFDViommu *viommu;
+ uint32_t bypass_hwpt_id;
+ uint32_t abort_hwpt_id;
+ QLIST_HEAD(, SMMUv3AccelDevice) device_list;
+} SMMUv3AccelState;
+
typedef struct SMMUv3AccelDevice {
SMMUDevice sdev;
+ HostIOMMUDeviceIOMMUFD *idev;
+ QLIST_ENTRY(SMMUv3AccelDevice) next;
+ SMMUv3AccelState *s_accel;
} SMMUv3AccelDevice;
#ifdef CONFIG_ARM_SMMUV3_ACCEL
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index f3386bd7ae..2aaa0c40c7 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -66,6 +66,10 @@ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s
smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
smmu_reset_exit(void) ""
+#smmuv3-accel.c
+smmuv3_accel_set_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
+smmuv3_accel_unset_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
+
# strongarm.c
strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
strongarm_ssp_read_underrun(void) "SSP rx underrun"
diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
index f644618f38..153310248d 100644
--- a/include/hw/arm/smmuv3-common.h
+++ b/include/hw/arm/smmuv3-common.h
@@ -100,6 +100,9 @@ REG32(STE_7, 28)
#define STE_CFG_ABORT(config) (!(config & 0x4))
#define STE_CFG_BYPASS(config) (config == 0x4)
+#define SMMU_STE_VALID (1ULL << 0)
+#define SMMU_STE_CFG_BYPASS (1ULL << 3)
+
/* Update STE fields */
#define STE_SET_VALID(ste, v) \
((ste)->word[0] = FIELD_DP32((ste)->word[0], STE_0, VALID, (v)))
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index bb7076286b..e54ece2d38 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -66,6 +66,7 @@ struct SMMUv3State {
/* SMMU has HW accelerator support for nested S1 + s2 */
bool accel;
+ struct SMMUv3AccelState *s_accel;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
2026-01-26 10:43 ` [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback Shameer Kolothum
@ 2026-04-15 13:02 ` Anton Kuchin
2026-04-15 14:59 ` Shameer Kolothum Thodi
0 siblings, 1 reply; 68+ messages in thread
From: Anton Kuchin @ 2026-04-15 13:02 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, jgg@nvidia.com,
nicolinc@nvidia.com, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, nathanc@nvidia.com,
mochs@nvidia.com, smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
yi.l.liu@intel.com, kjaju@nvidia.com, NB-Core Team
Hello! It seems to me that there is a problem with this patch, and I
want to check whether I understand this correctly or whether I'm just
doing something wrong.
Current master 11.0.0-rc4 doesn't work for multiple vfio devices behind
the same expander with accelerated SMMU. Qemu fails to start with "vfio
0009:03:00.0: [iommufd=250] error attach 0009:03:00.0 (257) to id=9:
Invalid argument" error.
I've tested v4 of this series and it worked great. Between v5 and v6, an
attach of the HWPT based on the SMMUv3 GBPA.ABORT value was added to
smmuv3_accel_alloc_viommu(), and I suspect this to be the root cause.
More details inline.
Not sure this message will properly attach to the original thread, so
here is the link just in case:
https://lore.kernel.org/qemu-devel/20260126104342.253965-13-skolothumtho@nvidia.com/
On 26/01/2026 11:43, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Implement the VFIO/PCI callbacks to attach and detach a HostIOMMUDevice
> to a vSMMUv3 when accel=on,
>
> - set_iommu_device(): attach a HostIOMMUDevice to a vIOMMU
> - unset_iommu_device(): detach and release associated resources
>
> In SMMUv3 accel=on mode, the guest SMMUv3 is backed by the host SMMUv3 via
> IOMMUFD. A vIOMMU object (created via IOMMU_VIOMMU_ALLOC) provides a per-VM,
> security-isolated handle to the physical SMMUv3. Without a vIOMMU, the
> vSMMUv3 cannot relay guest operations to the host hardware nor maintain
> isolation across VMs or devices. Therefore, set_iommu_device() allocates
> a vIOMMU object if one does not already exist.
>
> There are two main points to consider in this implementation:
>
> 1) VFIO core allocates and attaches a S2 HWPT that acts as the nesting
> parent for nested HWPTs(IOMMU_DOMAIN_NESTED). This parent HWPT will
> be shared across multiple vSMMU instances within a VM.
>
> 2) A device cannot attach directly to a vIOMMU. Instead, it attaches
> through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). Based on the STE
> configuration,there are three types of nested HWPTs: bypass, abort,
> and translate.
> -The bypass and abort proxy HWPTs are pre-allocated. When SMMUv3
> operates in global abort or bypass modes, as controlled by the GBPA
> register, or issues a vSTE for bypass or abort we attach these
> pre-allocated nested HWPTs.
> -The translate HWPT requires a vDEVICE to be allocated first, since
> invalidations and events depend on a valid vSID.
> -The vDEVICE allocation and attach operations for vSTE based HWPTs
> are implemented in subsequent patches.
>
> In summary, a device placed behind a vSMMU instance must have a vSID for
> translate vSTE. The bypass and abort vSTEs are pre-allocated as proxy
> nested HWPTs and is attached based on GBPA register. The core-managed
> nesting parent S2 HWPT is used as parent S2 HWPT for all the nested
> HWPTs and is intended to be shared across vSMMU instances within the
> same VM.
>
> set_iommu_device():
> - Reuse an existing vIOMMU for the same physical SMMU if available.
> If not, allocate a new one using the nesting parent S2 HWPT.
> - Pre-allocate two proxy nested HWPTs (bypass and abort) under the
> vIOMMU and install one based on GBPA.ABORT value.
> - Add the device to the vIOMMU’s device list.
>
> unset_iommu_device():
> - Re-attach device to the nesting parent S2 HWPT.
> - Remove the device from the vIOMMU’s device list.
> - If the list is empty, free the proxy HWPTs (bypass and abort)
> and release the vIOMMU object.
>
> Introduce struct SMMUv3AccelState, representing an accelerated SMMUv3
> instance backed by an iommufd vIOMMU object, and storing the bypass and
> abort proxy HWPT IDs.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/arm/smmuv3-accel.c | 156 +++++++++++++++++++++++++++++++++
> hw/arm/smmuv3-accel.h | 18 ++++
> hw/arm/trace-events | 4 +
> include/hw/arm/smmuv3-common.h | 3 +
> include/hw/arm/smmuv3.h | 1 +
> 5 files changed, 182 insertions(+)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index be09cf8b73..9c2b917a11 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -8,6 +8,7 @@
>
> #include "qemu/osdep.h"
> #include "qemu/error-report.h"
> +#include "trace.h"
>
> #include "hw/arm/smmuv3.h"
> #include "hw/core/iommu.h"
> @@ -15,6 +16,7 @@
> #include "hw/pci-host/gpex.h"
> #include "hw/vfio/pci.h"
>
> +#include "smmuv3-internal.h"
> #include "smmuv3-accel.h"
>
> /*
> @@ -43,6 +45,157 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
> return accel_dev;
> }
>
> +static uint32_t smmuv3_accel_gbpa_hwpt(SMMUv3State *s, SMMUv3AccelState *accel)
> +{
> + return FIELD_EX32(s->gbpa, GBPA, ABORT) ?
> + accel->abort_hwpt_id : accel->bypass_hwpt_id;
> +}
> +
> +static bool
> +smmuv3_accel_alloc_viommu(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
> + Error **errp)
> +{
> + SMMUv3AccelState *accel = s->s_accel;
> + struct iommu_hwpt_arm_smmuv3 bypass_data = {
> + .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
> + };
> + struct iommu_hwpt_arm_smmuv3 abort_data = {
> + .ste = { SMMU_STE_VALID, 0x0ULL },
> + };
> + uint32_t s2_hwpt_id = idev->hwpt_id;
> + uint32_t viommu_id, hwpt_id;
> + IOMMUFDViommu *viommu;
> +
> + if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
> + IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
> + s2_hwpt_id, &viommu_id, errp)) {
> + return false;
> + }
> +
> + viommu = g_new0(IOMMUFDViommu, 1);
> + viommu->viommu_id = viommu_id;
> + viommu->s2_hwpt_id = s2_hwpt_id;
> + viommu->iommufd = idev->iommufd;
> +
> + /*
> + * Pre-allocate HWPTs for S1 bypass and abort cases. These will be attached
> + * later for guest STEs or GBPAs that require bypass or abort configuration.
> + */
> + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, viommu_id,
> + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
> + sizeof(abort_data), &abort_data,
> + &accel->abort_hwpt_id, errp)) {
> + goto free_viommu;
> + }
> +
> + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, viommu_id,
> + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
> + sizeof(bypass_data), &bypass_data,
> + &accel->bypass_hwpt_id, errp)) {
> + goto free_abort_hwpt;
> + }
> +
> + /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
> + hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
> + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
> + goto free_bypass_hwpt;
> + }
My research points to this change. Here we attach the
IOMMU_HWPT_DATA_ARM_SMMUV3 abort HWPT to the device, replacing the default
IOMMU_HWPT_DATA_NONE one allocated and attached earlier in
iommufd_cdev_autodomains_get(). The problem I see is that when the
second device in the same group initializes its default HWPT, it no
longer matches the updated abort HWPT in the kernel, so the device allocates
a new IOAS and HWPT, but the kernel returns an error for the attach HWPT
request because the iommufd_group in the kernel already has a different
HWPT assigned.
How it worked before v6: during the initialization phase, the abort and bypass
HWPTs were allocated and stored for the vIOMMU, but not attached to devices.
Devices used the default IOMMU_HWPT_DATA_NONE HWPTs to be properly assigned
to iommufd_groups, and then at the end of machine creation
qdev_machine_creation_done() called smmuv3_accel_reset(), which attached
the same abort HWPT for all devices in the group.
Could you please confirm my understanding is correct?
> + accel->viommu = viommu;
> + return true;
> +
> +free_bypass_hwpt:
> + iommufd_backend_free_id(idev->iommufd, accel->bypass_hwpt_id);
> +free_abort_hwpt:
> + iommufd_backend_free_id(idev->iommufd, accel->abort_hwpt_id);
> +free_viommu:
> + iommufd_backend_free_id(idev->iommufd, viommu->viommu_id);
> + g_free(viommu);
> + return false;
> +}
> +
> +static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
> + HostIOMMUDevice *hiod, Error **errp)
> +{
> + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
> + SMMUState *bs = opaque;
> + SMMUv3State *s = ARM_SMMUV3(bs);
> + SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
> + SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
> +
> + if (!idev) {
> + return true;
> + }
> +
> + if (accel_dev->idev) {
> + if (accel_dev->idev != idev) {
> + error_setg(errp, "Device already has an associated idev 0x%x",
> + idev->devid);
> + return false;
> + }
> + return true;
> + }
> +
> + if (s->s_accel->viommu) {
> + goto done;
> + }
> +
> + if (!smmuv3_accel_alloc_viommu(s, idev, errp)) {
> + error_append_hint(errp, "Unable to alloc vIOMMU: idev devid 0x%x: ",
> + idev->devid);
> + return false;
> + }
> +
> +done:
> + accel_dev->idev = idev;
> + accel_dev->s_accel = s->s_accel;
> + QLIST_INSERT_HEAD(&s->s_accel->device_list, accel_dev, next);
> + trace_smmuv3_accel_set_iommu_device(devfn, idev->devid);
> + return true;
> +}
> +
> +static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
> + int devfn)
> +{
> + SMMUState *bs = opaque;
> + SMMUPciBus *sbus = g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus);
> + HostIOMMUDeviceIOMMUFD *idev;
> + SMMUv3AccelDevice *accel_dev;
> + SMMUv3AccelState *accel;
> + SMMUDevice *sdev;
> +
> + if (!sbus) {
> + return;
> + }
> +
> + sdev = sbus->pbdev[devfn];
> + if (!sdev) {
> + return;
> + }
> +
> + accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
> + idev = accel_dev->idev;
> + accel = accel_dev->s_accel;
> + /* Re-attach the default s2 hwpt id */
> + if (!host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, NULL)) {
> + error_report("Unable to attach the default HW pagetable: idev devid "
> + "0x%x", idev->devid);
> + }
> +
> + accel_dev->idev = NULL;
> + accel_dev->s_accel = NULL;
> + QLIST_REMOVE(accel_dev, next);
> + trace_smmuv3_accel_unset_iommu_device(devfn, idev->devid);
> +
> + if (QLIST_EMPTY(&accel->device_list)) {
> + iommufd_backend_free_id(accel->viommu->iommufd, accel->bypass_hwpt_id);
> + iommufd_backend_free_id(accel->viommu->iommufd, accel->abort_hwpt_id);
> + iommufd_backend_free_id(accel->viommu->iommufd,
> + accel->viommu->viommu_id);
> + g_free(accel->viommu);
> + accel->viommu = NULL;
> + }
> +}
> +
> /*
> * Only allow PCIe bridges, pxb-pcie roots, and GPEX roots so vfio-pci
> * endpoints can sit downstream. Accelerated SMMUv3 requires a vfio-pci
> @@ -145,6 +298,8 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
> .supports_address_space = smmuv3_accel_supports_as,
> .get_address_space = smmuv3_accel_find_add_as,
> .get_viommu_flags = smmuv3_accel_get_viommu_flags,
> + .set_iommu_device = smmuv3_accel_set_iommu_device,
> + .unset_iommu_device = smmuv3_accel_unset_iommu_device,
> };
>
> static void smmuv3_accel_as_init(SMMUv3State *s)
> @@ -168,6 +323,7 @@ void smmuv3_accel_init(SMMUv3State *s)
> {
> SMMUState *bs = ARM_SMMU(s);
>
> + s->s_accel = g_new0(SMMUv3AccelState, 1);
> bs->iommu_ops = &smmuv3_accel_ops;
> smmuv3_accel_as_init(s);
> }
> diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
> index 0dc6b00d35..efb631db4f 100644
> --- a/hw/arm/smmuv3-accel.h
> +++ b/hw/arm/smmuv3-accel.h
> @@ -10,10 +10,28 @@
> #define HW_ARM_SMMUV3_ACCEL_H
>
> #include "hw/arm/smmu-common.h"
> +#include "system/iommufd.h"
> +#ifdef CONFIG_LINUX
> +#include <linux/iommufd.h>
> +#endif
> #include CONFIG_DEVICES
>
> +/*
> + * Represents an accelerated SMMU instance backed by an iommufd vIOMMU object.
> + * Holds bypass and abort proxy HWPT IDs used for device attachment.
> + */
> +typedef struct SMMUv3AccelState {
> + IOMMUFDViommu *viommu;
> + uint32_t bypass_hwpt_id;
> + uint32_t abort_hwpt_id;
> + QLIST_HEAD(, SMMUv3AccelDevice) device_list;
> +} SMMUv3AccelState;
> +
> typedef struct SMMUv3AccelDevice {
> SMMUDevice sdev;
> + HostIOMMUDeviceIOMMUFD *idev;
> + QLIST_ENTRY(SMMUv3AccelDevice) next;
> + SMMUv3AccelState *s_accel;
> } SMMUv3AccelDevice;
>
> #ifdef CONFIG_ARM_SMMUV3_ACCEL
> diff --git a/hw/arm/trace-events b/hw/arm/trace-events
> index f3386bd7ae..2aaa0c40c7 100644
> --- a/hw/arm/trace-events
> +++ b/hw/arm/trace-events
> @@ -66,6 +66,10 @@ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s
> smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
> smmu_reset_exit(void) ""
>
> +#smmuv3-accel.c
> +smmuv3_accel_set_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
> +smmuv3_accel_unset_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
> +
> # strongarm.c
> strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
> strongarm_ssp_read_underrun(void) "SSP rx underrun"
> diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
> index f644618f38..153310248d 100644
> --- a/include/hw/arm/smmuv3-common.h
> +++ b/include/hw/arm/smmuv3-common.h
> @@ -100,6 +100,9 @@ REG32(STE_7, 28)
> #define STE_CFG_ABORT(config) (!(config & 0x4))
> #define STE_CFG_BYPASS(config) (config == 0x4)
>
> +#define SMMU_STE_VALID (1ULL << 0)
> +#define SMMU_STE_CFG_BYPASS (1ULL << 3)
> +
> /* Update STE fields */
> #define STE_SET_VALID(ste, v) \
> ((ste)->word[0] = FIELD_DP32((ste)->word[0], STE_0, VALID, (v)))
> diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
> index bb7076286b..e54ece2d38 100644
> --- a/include/hw/arm/smmuv3.h
> +++ b/include/hw/arm/smmuv3.h
> @@ -66,6 +66,7 @@ struct SMMUv3State {
>
> /* SMMU has HW accelerator support for nested S1 + s2 */
> bool accel;
> + struct SMMUv3AccelState *s_accel;
> };
>
> typedef enum {
^ permalink raw reply [flat|nested] 68+ messages in thread* RE: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
2026-04-15 13:02 ` Anton Kuchin
@ 2026-04-15 14:59 ` Shameer Kolothum Thodi
2026-04-15 16:18 ` Anton Kuchin
0 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-04-15 14:59 UTC (permalink / raw)
To: Anton Kuchin, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, Jason Gunthorpe,
Nicolin Chen, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, Nathan Chen, Matt Ochs,
smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
yi.l.liu@intel.com, Krishnakant Jaju, NB-Core Team
> -----Original Message-----
> From: Anton Kuchin <antonkuchin@nebius.com>
> Sent: 15 April 2026 14:03
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; clg@redhat.com; alex@shazbot.org; Nathan Chen
> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> Krishnakant Jaju <kjaju@nvidia.com>; NB-Core Team <NB-
> CoreTeam@nebius.com>
> Subject: Re: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add
> set/unset_iommu_device callback
>
> External email: Use caution opening links or attachments
>
>
> Hello! It seems to me that there is a problems with this patch and I
> want to check if I understand this correctly or maybe I'm just doing
> something wrong.
>
> Current master 11.0.0-rc4 doesn't work for multiple vfio devices behind
> the same expander with accelerated SMMU. Qemu fails to start with "vfio
> 0009:03:00.0: [iommufd=250] error attach 0009:03:00.0 (257) to id=9:
> Invalid argument" error.
>
> I've tested v4 of this series and it worked great. Between v5 and v6
> attach of HWPT based on SMMUv3 GBPA.ABORT value was added to
> smmuv3_accel_alloc_viommu() and I suspect this to be the root cause.
> More details inline.
>
>
> Not sure this message will properly attach to the original thread, so
> here is the link just in case:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.
> kernel.org%2Fqemu-devel%2F20260126104342.253965-13-
> skolothumtho%40nvidia.com%2F&data=05%7C02%7Cskolothumtho%40nvi
> dia.com%7C0c6fd4366c8344b59fb008de9aef4554%7C43083d15727340c1
> b7db39efd9ccc17a%7C0%7C0%7C639118549639467993%7CUnknown%7
> CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOi
> JXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=
> fTpbtuKL5OXtvt0XpO12%2FpE7PIx%2Fznyqe%2B7TfBy8DAA%3D&reserved=
> 0
>
> On 26/01/2026 11:43, Shameer Kolothum wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> >
> > Implement the VFIO/PCI callbacks to attach and detach a HostIOMMUDevice
> > to a vSMMUv3 when accel=on,
> >
> > - set_iommu_device(): attach a HostIOMMUDevice to a vIOMMU
> > - unset_iommu_device(): detach and release associated resources
> >
> > In SMMUv3 accel=on mode, the guest SMMUv3 is backed by the host
> SMMUv3 via
> > IOMMUFD. A vIOMMU object (created via IOMMU_VIOMMU_ALLOC)
> provides a per-VM,
> > security-isolated handle to the physical SMMUv3. Without a vIOMMU, the
> > vSMMUv3 cannot relay guest operations to the host hardware nor maintain
> > isolation across VMs or devices. Therefore, set_iommu_device() allocates
> > a vIOMMU object if one does not already exist.
> >
> > There are two main points to consider in this implementation:
> >
> > 1) VFIO core allocates and attaches a S2 HWPT that acts as the nesting
> > parent for nested HWPTs(IOMMU_DOMAIN_NESTED). This parent HWPT
> will
> > be shared across multiple vSMMU instances within a VM.
> >
> > 2) A device cannot attach directly to a vIOMMU. Instead, it attaches
> > through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). Based on the
> STE
> > configuration,there are three types of nested HWPTs: bypass, abort,
> > and translate.
> > -The bypass and abort proxy HWPTs are pre-allocated. When SMMUv3
> > operates in global abort or bypass modes, as controlled by the GBPA
> > register, or issues a vSTE for bypass or abort we attach these
> > pre-allocated nested HWPTs.
> > -The translate HWPT requires a vDEVICE to be allocated first, since
> > invalidations and events depend on a valid vSID.
> > -The vDEVICE allocation and attach operations for vSTE based HWPTs
> > are implemented in subsequent patches.
> >
> > In summary, a device placed behind a vSMMU instance must have a vSID for
> > translate vSTE. The bypass and abort vSTEs are pre-allocated as proxy
> > nested HWPTs and is attached based on GBPA register. The core-managed
> > nesting parent S2 HWPT is used as parent S2 HWPT for all the nested
> > HWPTs and is intended to be shared across vSMMU instances within the
> > same VM.
> >
> > set_iommu_device():
> > - Reuse an existing vIOMMU for the same physical SMMU if available.
> > If not, allocate a new one using the nesting parent S2 HWPT.
> > - Pre-allocate two proxy nested HWPTs (bypass and abort) under the
> > vIOMMU and install one based on GBPA.ABORT value.
> > - Add the device to the vIOMMU's device list.
> >
> > unset_iommu_device():
> > - Re-attach device to the nesting parent S2 HWPT.
> > - Remove the device from the vIOMMU's device list.
> > - If the list is empty, free the proxy HWPTs (bypass and abort)
> > and release the vIOMMU object.
> >
> > Introduce struct SMMUv3AccelState, representing an accelerated SMMUv3
> > instance backed by an iommufd vIOMMU object, and storing the bypass and
> > abort proxy HWPT IDs.
> >
> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com
> > Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> > Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> > Reviewed-by: Eric Auger <eric.auger@redhat.com>
> > Tested-by: Eric Auger <eric.auger@redhat.com>
> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > ---
> > hw/arm/smmuv3-accel.c | 156
> +++++++++++++++++++++++++++++++++
> > hw/arm/smmuv3-accel.h | 18 ++++
> > hw/arm/trace-events | 4 +
> > include/hw/arm/smmuv3-common.h | 3 +
> > include/hw/arm/smmuv3.h | 1 +
> > 5 files changed, 182 insertions(+)
> >
> > diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> > index be09cf8b73..9c2b917a11 100644
> > --- a/hw/arm/smmuv3-accel.c
> > +++ b/hw/arm/smmuv3-accel.c
> > @@ -8,6 +8,7 @@
> >
> > #include "qemu/osdep.h"
> > #include "qemu/error-report.h"
> > +#include "trace.h"
> >
> > #include "hw/arm/smmuv3.h"
> > #include "hw/core/iommu.h"
> > @@ -15,6 +16,7 @@
> > #include "hw/pci-host/gpex.h"
> > #include "hw/vfio/pci.h"
> >
> > +#include "smmuv3-internal.h"
> > #include "smmuv3-accel.h"
> >
> > /*
> > @@ -43,6 +45,157 @@ static SMMUv3AccelDevice
> *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
> > return accel_dev;
> > }
> >
> > +static uint32_t smmuv3_accel_gbpa_hwpt(SMMUv3State *s,
> SMMUv3AccelState *accel)
> > +{
> > + return FIELD_EX32(s->gbpa, GBPA, ABORT) ?
> > + accel->abort_hwpt_id : accel->bypass_hwpt_id;
> > +}
> > +
> > +static bool
> > +smmuv3_accel_alloc_viommu(SMMUv3State *s,
> HostIOMMUDeviceIOMMUFD *idev,
> > + Error **errp)
> > +{
> > + SMMUv3AccelState *accel = s->s_accel;
> > + struct iommu_hwpt_arm_smmuv3 bypass_data = {
> > + .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
> > + };
> > + struct iommu_hwpt_arm_smmuv3 abort_data = {
> > + .ste = { SMMU_STE_VALID, 0x0ULL },
> > + };
> > + uint32_t s2_hwpt_id = idev->hwpt_id;
> > + uint32_t viommu_id, hwpt_id;
> > + IOMMUFDViommu *viommu;
> > +
> > + if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
> > + IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
> > + s2_hwpt_id, &viommu_id, errp)) {
> > + return false;
> > + }
> > +
> > + viommu = g_new0(IOMMUFDViommu, 1);
> > + viommu->viommu_id = viommu_id;
> > + viommu->s2_hwpt_id = s2_hwpt_id;
> > + viommu->iommufd = idev->iommufd;
> > +
> > + /*
> > + * Pre-allocate HWPTs for S1 bypass and abort cases. These will be
> attached
> > + * later for guest STEs or GBPAs that require bypass or abort
> configuration.
> > + */
> > + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> viommu_id,
> > + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
> > + sizeof(abort_data), &abort_data,
> > + &accel->abort_hwpt_id, errp)) {
> > + goto free_viommu;
> > + }
> > +
> > + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> viommu_id,
> > + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
> > + sizeof(bypass_data), &bypass_data,
> > + &accel->bypass_hwpt_id, errp)) {
> > + goto free_abort_hwpt;
> > + }
> > +
> > + /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
> > + hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
> > + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
> > + goto free_bypass_hwpt;
> > + }
>
> My research points to this change. Here we attach
> IOMMU_HWPT_DATA_ARM_SMMUV3 abort HWPT to device, replacing the
> default
> IOMMU_HWPT_DATA_NONE one allocated and attached earlier in
> iommufd_cdev_autodomains_get(). The problem I see is that when the
> second device in the same group initializes its default HWPT it no
> longer matches the updated abort HWPT in kernel, so device allocates a
> new IOAS and HWPT but kernel returns error for attach HWPT request
> because iommufd_group in kernel already has different HWPT assigned.
Thanks for debugging.
Just to clarify, when you say "multiple vfio devices behind the same
expander", do you mean they are placed under the same QEMU pxb-pcie
expander, or that they are also under the same physical SMMU
on the host?
If the passthrough devices are not under the same physical SMMUv3,
then a shared vSMMU will not work, as the kernel currently does not
support sharing the nesting parent domain (S2) across different
physical SMMUv3 instances. In that case, the expected topology is
one vSMMU per physical SMMU.
Could you please confirm the host topology?
>
> How it worked before v6: during initialization phase abort and bypass
> HWPTs were allocated and stored for vIOMMU, but not attached to devices.
> Devices used default IOMMU_HWPT_DATA_NONE HWPTs to be properly
> assigned
> to iommufd_groups, and then at the end of machine creation
> qdev_machine_creation_done() calls smmuv3_accel_reset() which attaches
> same abort HWPT for all devices in the group.
> Could you please confirm my understanding is correct?
It is true that QEMU behaviour changed between v5 and v6.
From v6 onwards we try attaching a GBPA-based HWPT as soon as the vIOMMU
is allocated. At that point we do not yet know the vSID of the device, so we
cannot reliably call iommufd_backend_alloc_vdev(). The attachment is therefore
only valid if the host kernel has the patch below.
"iommu/arm-smmu-v3-iommufd: Allow attaching nested domain for GBPA cases"
https://lore.kernel.org/all/20251103172755.2026145-1-nicolinc@nvidia.com/
But if the above patch is missing, you should see an error with the v9 series (or
anything from v6 onwards) even with a single assigned device.
Could you please confirm your host kernel includes the above?
If none of the above cases apply, could you please provide the QEMU
cmdline used, the lspci -tv output from the host, and the host kernel
version used?
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread* Re: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
2026-04-15 14:59 ` Shameer Kolothum Thodi
@ 2026-04-15 16:18 ` Anton Kuchin
2026-04-15 17:49 ` Shameer Kolothum Thodi
0 siblings, 1 reply; 68+ messages in thread
From: Anton Kuchin @ 2026-04-15 16:18 UTC (permalink / raw)
To: Shameer Kolothum Thodi, qemu-arm@nongnu.org,
qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, Jason Gunthorpe,
Nicolin Chen, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, Nathan Chen, Matt Ochs,
smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
yi.l.liu@intel.com, Krishnakant Jaju, NB-Core Team
On 15/04/2026 16:59, Shameer Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Anton Kuchin <antonkuchin@nebius.com>
>> Sent: 15 April 2026 14:03
>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>> arm@nongnu.org; qemu-devel@nongnu.org
>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
>> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
>> berrange@redhat.com; clg@redhat.com; alex@shazbot.org; Nathan Chen
>> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
>> smostafa@google.com; wangzhou1@hisilicon.com;
>> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
>> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
>> Krishnakant Jaju <kjaju@nvidia.com>; NB-Core Team <NB-
>> CoreTeam@nebius.com>
>> Subject: Re: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add
>> set/unset_iommu_device callback
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Hello! It seems to me that there is a problems with this patch and I
>> want to check if I understand this correctly or maybe I'm just doing
>> something wrong.
>>
>> Current master 11.0.0-rc4 doesn't work for multiple vfio devices behind
>> the same expander with accelerated SMMU. Qemu fails to start with "vfio
>> 0009:03:00.0: [iommufd=250] error attach 0009:03:00.0 (257) to id=9:
>> Invalid argument" error.
>>
>> I've tested v4 of this series and it worked great. Between v5 and v6
>> attach of HWPT based on SMMUv3 GBPA.ABORT value was added to
>> smmuv3_accel_alloc_viommu() and I suspect this to be the root cause.
>> More details inline.
>>
>>
>> Not sure this message will properly attach to the original thread, so
>> here is the link just in case:
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.
>> kernel.org%2Fqemu-devel%2F20260126104342.253965-13-
>> skolothumtho%40nvidia.com%2F&data=05%7C02%7Cskolothumtho%40nvi
>> dia.com%7C0c6fd4366c8344b59fb008de9aef4554%7C43083d15727340c1
>> b7db39efd9ccc17a%7C0%7C0%7C639118549639467993%7CUnknown%7
>> CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOi
>> JXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=
>> fTpbtuKL5OXtvt0XpO12%2FpE7PIx%2Fznyqe%2B7TfBy8DAA%3D&reserved=
>> 0
>>
>> On 26/01/2026 11:43, Shameer Kolothum wrote:
>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>>
>>> Implement the VFIO/PCI callbacks to attach and detach a HostIOMMUDevice
>>> to a vSMMUv3 when accel=on,
>>>
>>> - set_iommu_device(): attach a HostIOMMUDevice to a vIOMMU
>>> - unset_iommu_device(): detach and release associated resources
>>>
>>> In SMMUv3 accel=on mode, the guest SMMUv3 is backed by the host
>> SMMUv3 via
>>> IOMMUFD. A vIOMMU object (created via IOMMU_VIOMMU_ALLOC)
>> provides a per-VM,
>>> security-isolated handle to the physical SMMUv3. Without a vIOMMU, the
>>> vSMMUv3 cannot relay guest operations to the host hardware nor maintain
>>> isolation across VMs or devices. Therefore, set_iommu_device() allocates
>>> a vIOMMU object if one does not already exist.
>>>
>>> There are two main points to consider in this implementation:
>>>
>>> 1) VFIO core allocates and attaches a S2 HWPT that acts as the nesting
>>> parent for nested HWPTs(IOMMU_DOMAIN_NESTED). This parent HWPT
>> will
>>> be shared across multiple vSMMU instances within a VM.
>>>
>>> 2) A device cannot attach directly to a vIOMMU. Instead, it attaches
>>> through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). Based on the
>> STE
>>> configuration,there are three types of nested HWPTs: bypass, abort,
>>> and translate.
>>> -The bypass and abort proxy HWPTs are pre-allocated. When SMMUv3
>>> operates in global abort or bypass modes, as controlled by the GBPA
>>> register, or issues a vSTE for bypass or abort we attach these
>>> pre-allocated nested HWPTs.
>>> -The translate HWPT requires a vDEVICE to be allocated first, since
>>> invalidations and events depend on a valid vSID.
>>> -The vDEVICE allocation and attach operations for vSTE based HWPTs
>>> are implemented in subsequent patches.
>>>
>>> In summary, a device placed behind a vSMMU instance must have a vSID for
>>> translate vSTE. The bypass and abort vSTEs are pre-allocated as proxy
>>> nested HWPTs and is attached based on GBPA register. The core-managed
>>> nesting parent S2 HWPT is used as parent S2 HWPT for all the nested
>>> HWPTs and is intended to be shared across vSMMU instances within the
>>> same VM.
>>>
>>> set_iommu_device():
>>> - Reuse an existing vIOMMU for the same physical SMMU if available.
>>> If not, allocate a new one using the nesting parent S2 HWPT.
>>> - Pre-allocate two proxy nested HWPTs (bypass and abort) under the
>>> vIOMMU and install one based on GBPA.ABORT value.
>>> - Add the device to the vIOMMU's device list.
>>>
>>> unset_iommu_device():
>>> - Re-attach device to the nesting parent S2 HWPT.
>>> - Remove the device from the vIOMMU's device list.
>>> - If the list is empty, free the proxy HWPTs (bypass and abort)
>>> and release the vIOMMU object.
>>>
>>> Introduce struct SMMUv3AccelState, representing an accelerated SMMUv3
>>> instance backed by an iommufd vIOMMU object, and storing the bypass and
>>> abort proxy HWPT IDs.
>>>
>>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>>> Signed-off-by: Shameer Kolothum
>> <shameerali.kolothum.thodi@huawei.com
>>> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
>>> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
>>> Reviewed-by: Eric Auger <eric.auger@redhat.com>
>>> Tested-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
>>> ---
>>> hw/arm/smmuv3-accel.c | 156
>> +++++++++++++++++++++++++++++++++
>>> hw/arm/smmuv3-accel.h | 18 ++++
>>> hw/arm/trace-events | 4 +
>>> include/hw/arm/smmuv3-common.h | 3 +
>>> include/hw/arm/smmuv3.h | 1 +
>>> 5 files changed, 182 insertions(+)
>>>
>>> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
>>> index be09cf8b73..9c2b917a11 100644
>>> --- a/hw/arm/smmuv3-accel.c
>>> +++ b/hw/arm/smmuv3-accel.c
>>> @@ -8,6 +8,7 @@
>>>
>>> #include "qemu/osdep.h"
>>> #include "qemu/error-report.h"
>>> +#include "trace.h"
>>>
>>> #include "hw/arm/smmuv3.h"
>>> #include "hw/core/iommu.h"
>>> @@ -15,6 +16,7 @@
>>> #include "hw/pci-host/gpex.h"
>>> #include "hw/vfio/pci.h"
>>>
>>> +#include "smmuv3-internal.h"
>>> #include "smmuv3-accel.h"
>>>
>>> /*
>>> @@ -43,6 +45,157 @@ static SMMUv3AccelDevice
>> *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
>>> return accel_dev;
>>> }
>>>
>>> +static uint32_t smmuv3_accel_gbpa_hwpt(SMMUv3State *s,
>> SMMUv3AccelState *accel)
>>> +{
>>> + return FIELD_EX32(s->gbpa, GBPA, ABORT) ?
>>> + accel->abort_hwpt_id : accel->bypass_hwpt_id;
>>> +}
>>> +
>>> +static bool
>>> +smmuv3_accel_alloc_viommu(SMMUv3State *s,
>> HostIOMMUDeviceIOMMUFD *idev,
>>> + Error **errp)
>>> +{
>>> + SMMUv3AccelState *accel = s->s_accel;
>>> + struct iommu_hwpt_arm_smmuv3 bypass_data = {
>>> + .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
>>> + };
>>> + struct iommu_hwpt_arm_smmuv3 abort_data = {
>>> + .ste = { SMMU_STE_VALID, 0x0ULL },
>>> + };
>>> + uint32_t s2_hwpt_id = idev->hwpt_id;
>>> + uint32_t viommu_id, hwpt_id;
>>> + IOMMUFDViommu *viommu;
>>> +
>>> + if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
>>> + IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
>>> + s2_hwpt_id, &viommu_id, errp)) {
>>> + return false;
>>> + }
>>> +
>>> + viommu = g_new0(IOMMUFDViommu, 1);
>>> + viommu->viommu_id = viommu_id;
>>> + viommu->s2_hwpt_id = s2_hwpt_id;
>>> + viommu->iommufd = idev->iommufd;
>>> +
>>> + /*
>>> + * Pre-allocate HWPTs for S1 bypass and abort cases. These will be
>> attached
>>> + * later for guest STEs or GBPAs that require bypass or abort
>> configuration.
>>> + */
>>> + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> viommu_id,
>>> + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
>>> + sizeof(abort_data), &abort_data,
>>> + &accel->abort_hwpt_id, errp)) {
>>> + goto free_viommu;
>>> + }
>>> +
>>> + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> viommu_id,
>>> + 0, IOMMU_HWPT_DATA_ARM_SMMUV3,
>>> + sizeof(bypass_data), &bypass_data,
>>> + &accel->bypass_hwpt_id, errp)) {
>>> + goto free_abort_hwpt;
>>> + }
>>> +
>>> + /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
>>> + hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
>>> + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
>>> + goto free_bypass_hwpt;
>>> + }
>> My research points to this change. Here we attach
>> IOMMU_HWPT_DATA_ARM_SMMUV3 abort HWPT to device, replacing the
>> default
>> IOMMU_HWPT_DATA_NONE one allocated and attached earlier in
>> iommufd_cdev_autodomains_get(). The problem I see is that when the
>> second device in the same group initializes its default HWPT it no
>> longer matches the updated abort HWPT in kernel, so device allocates a
>> new IOAS and HWPT but kernel returns error for attach HWPT request
>> because iommufd_group in kernel already has different HWPT assigned.
> Thanks for debugging.
>
> Just to clarify, when you say "multiple vfio devices behind the same
> expander", do you mean they are placed under the same QEMU pxb-pcie
> expander, or that they are also under the same physical SMMU
> on the host?
>
> If the passthrough devices are not under the same physical SMMUv3,
> then a shared vSMMU will not work, as the kernel currently does not
> support sharing the nesting parent domain (S2) across different
> physical SMMUv3 instances. In that case, the expected topology is
> one vSMMU per physical SMMU.
>
> Could you please confirm the host topology?
Yes, they are both under the same physical SMMU on the host and under
the same QEMU pxb-pcie (also in the same iommu group). I'm building one
vSMMU per pSMMU topology.
$ ls -al /sys/bus/pci/devices/0009:{06,03}:00.0/iommu/device
lrwxrwxrwx 1 root root 0 Apr 15 15:10
/sys/bus/pci/devices/0009:03:00.0/iommu/device ->
../../../arm-smmu-v3.13.auto
lrwxrwxrwx 1 root root 0 Apr 15 15:10
/sys/bus/pci/devices/0009:06:00.0/iommu/device ->
../../../arm-smmu-v3.13.auto
>
>> How it worked before v6: during initialization phase abort and bypass
>> HWPTs were allocated and stored for vIOMMU, but not attached to devices.
>> Devices used default IOMMU_HWPT_DATA_NONE HWPTs to be properly
>> assigned
>> to iommufd_groups, and then at the end of machine creation
>> qdev_machine_creation_done() calls smmuv3_accel_reset() which attaches
>> same abort HWPT for all devices in the group.
>> Could you please confirm my understanding is correct?
> It is true that QEMU behaviour changed between v5 and v6.
> From v6 onwards we try attaching a GBPA based HWPT as soon as vIOMMU
> is allocated. And at this instance, we are not sure of the vSID of the device and
> cannot reliably do an iommufd_backend_alloc_vdev(). So, it requires the below
> patch in host kernel to be a valid attachment.
>
> "iommu/arm-smmu-v3-iommufd: Allow attaching nested domain for GBPA cases"
> https://lore.kernel.org/all/20251103172755.2026145-1-nicolinc@nvidia.com/
Thank you! Our kernel does lack this patch, I'll try applying it and
check if this helps.
Could you please explain the purpose of this change? I couldn't
find an explanation in the mailing list discussions and am genuinely
curious.
>
> But if the above patch is missing, you should see error with the v9 series(or from v6
> onwards) with only one device assignment itself.
Which error should I get? In my case the VM starts OK with 8 expanders,
each one with vSMMU attached and one device like this:
-+-pxb---switch_up---switch_down---device
\-vSMMU
And I start getting errors only when I add more downstreams with devices.
> Could you please confirm your host kernel includes the above?
I'm using custom 6.14.8 with some backports. I think that I also tested
6.14.0-1015-nvidia-64k, but now I'm not sure about that.
>
> If all the above cases are not valid, could you please provide the QEMU
> cmdline used, lspci -tv output from host as well as host kernel version
> used?
>
> Thanks,
> Shameer
Thanks,
Anton
^ permalink raw reply [flat|nested] 68+ messages in thread* RE: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
2026-04-15 16:18 ` Anton Kuchin
@ 2026-04-15 17:49 ` Shameer Kolothum Thodi
0 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-04-15 17:49 UTC (permalink / raw)
To: Anton Kuchin, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, Jason Gunthorpe,
Nicolin Chen, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, Nathan Chen, Matt Ochs,
smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
yi.l.liu@intel.com, Krishnakant Jaju, NB-Core Team
> -----Original Message-----
> From: Anton Kuchin <antonkuchin@nebius.com>
> Sent: 15 April 2026 17:19
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; clg@redhat.com; alex@shazbot.org; Nathan Chen
> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> Krishnakant Jaju <kjaju@nvidia.com>; NB-Core Team <NB-
> CoreTeam@nebius.com>
> Subject: Re: [PATCH v9 12/37] hw/arm/smmuv3-accel: Add
> set/unset_iommu_device callback
[...]
>
> >>> +
> >>> + /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
> >>> + hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
> >>> + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
> >>> + goto free_bypass_hwpt;
> >>> + }
> >> My research points to this change. Here we attach
> >> IOMMU_HWPT_DATA_ARM_SMMUV3 abort HWPT to device, replacing
> the
> >> default
> >> IOMMU_HWPT_DATA_NONE one allocated and attached earlier in
> >> iommufd_cdev_autodomains_get(). The problem I see is that when the
> >> second device in the same group initializes its default HWPT it no
> >> longer matches the updated abort HWPT in kernel, so device allocates a
> >> new IOAS and HWPT but kernel returns error for attach HWPT request
> >> because iommufd_group in kernel already has different HWPT assigned.
> > Thanks for debugging.
> >
> > Just to clarify, when you say "multiple vfio devices behind the same
> > expander", do you mean they are placed under the same QEMU pxb-pcie
> > expander, or that they are also under the same physical SMMU
> > on the host?
> >
> > If the passthrough devices are not under the same physical SMMUv3,
> > then a shared vSMMU will not work, as the kernel currently does not
> > support sharing the nesting parent domain (S2) across different
> > physical SMMUv3 instances. In that case, the expected topology is
> > one vSMMU per physical SMMU.
> >
> > Could you please confirm the host topology?
>
> Yes, they are both under the same physical SMMU on the host and under
> the same QEMU pxb-pcie (also in the same iommu group). I'm building one
> vSMMU per pSMMU topology.
OK, good to know that they are under the same physical SMMUv3.
Regarding "also in the same iommu group": do you mean the same IOMMU group
in the host kernel? Are there any other devices in that IOMMU group which are
not assigned to the same vSMMU?
Please run the following and share the output:
dmesg |grep Adding |grep iommu
> $ ls -al /sys/bus/pci/devices/0009:{06,03}:00.0/iommu/device
> lrwxrwxrwx 1 root root 0 Apr 15 15:10
> /sys/bus/pci/devices/0009:03:00.0/iommu/device ->
> ../../../arm-smmu-v3.13.auto
> lrwxrwxrwx 1 root root 0 Apr 15 15:10
> /sys/bus/pci/devices/0009:06:00.0/iommu/device ->
> ../../../arm-smmu-v3.13.auto
>
> >
> >> How it worked before v6: during initialization phase abort and bypass
> >> HWPTs were allocated and stored for vIOMMU, but not attached to devices.
> >> Devices used default IOMMU_HWPT_DATA_NONE HWPTs to be properly
> >> assigned
> >> to iommufd_groups, and then at the end of machine creation
> >> qdev_machine_creation_done() calls smmuv3_accel_reset() which attaches
> >> same abort HWPT for all devices in the group.
> >> Could you please confirm my understanding is correct?
> > It is true that QEMU behaviour changed between v5 and v6.
> > From v6 onwards we try attaching a GBPA based HWPT as soon as vIOMMU
> > is allocated. And at this instance, we are not sure of the vSID of the device
> and
> > cannot reliably do an iommufd_backend_alloc_vdev(). So, it requires the
> below
> > patch in host kernel to be a valid attachment.
> >
> > "iommu/arm-smmu-v3-iommufd: Allow attaching nested domain for GBPA
> cases"
> >
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.
> kernel.org%2Fall%2F20251103172755.2026145-1-
> nicolinc%40nvidia.com%2F&data=05%7C02%7Cskolothumtho%40nvidia.co
> m%7Cefaafe8b1e0741162c6608de9b0aa609%7C43083d15727340c1b7db3
> 9efd9ccc17a%7C0%7C0%7C639118667326949215%7CUnknown%7CTWFp
> bGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4
> zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=ecbY%
> 2Bs12Z1iVq4ugmwxotc5VRxe%2BUfejWfd4cHnBJks%3D&reserved=0
> Thank you! Our kernel does lack this patch, I'll try applying it and
> check if this helps.
> Could you please explain what is the purpose of this change. I couldn't
> find an explanation
> in mailing list discussions and genuinely curious.
This is to take care of the SMMU_GBPA register value which decides the
Global ByPass or Abort behaviour. Based on this register value we attach
a bypass or abort HWPT as soon as a vIOMMU is allocated.
> > But if the above patch is missing, you should see error with the v9 series(or
> from v6
> > onwards) with only one device assignment itself.
> Which error should I get?
Without the above I would expect QEMU to fail in smmuv3_accel_alloc_viommu() at:
> > + /* Attach a HWPT based on SMMUv3 GBPA.ABORT value */
> > + hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
> > + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
> > + goto free_bypass_hwpt;
> > + }
> In my case the VM starts OK with 8 expanders,
> each one with vSMMU attached and one device like this:
>
> -+-pxb---switch_up---switch_down---device
> \-vSMMU
>
> And I start getting errors only when I add more downstreams with devices.
That is a bit confusing. As I said, if you lack the above kernel patch,
it should fail in the single-device case as well.
> > Could you please confirm your host kernel includes the above?
> I'm using custom 6.14.8 with some backports. I think that I also tested
> 6.14.0-1015-nvidia-64k, but now I'm not sure about that.
I don't think 6.14.0-1015-nvidia-64k has the patch either.
Is it possible for you to try an upstream kernel (6.19 or later) or the
Ubuntu-nvidia-6.17-6.17.0-1014 kernel?
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 13/37] hw/arm/smmuv3: propagate smmuv3_cmdq_consume() errors to caller
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (11 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 12/37] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 14/37] hw/arm/smmuv3-accel: Add nested vSTE install/uninstall support Shameer Kolothum
` (24 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
smmuv3_cmdq_consume() is updated to return detailed errors via errp.
Although this is currently a no-op, it prepares the ground for accel
SMMUv3 specific command handling where proper error reporting will be
useful.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3.c | 67 +++++++++++++++++++++++++++----------------------
1 file changed, 37 insertions(+), 30 deletions(-)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 95d44f81ed..ade2b43ab8 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1279,7 +1279,7 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd, SMMUStage stage)
}
}
-static int smmuv3_cmdq_consume(SMMUv3State *s)
+static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
{
SMMUState *bs = ARM_SMMU(s);
SMMUCmdError cmd_error = SMMU_CERROR_NONE;
@@ -1547,42 +1547,44 @@ static MemTxResult smmu_writell(SMMUv3State *s, hwaddr offset,
static MemTxResult smmu_writel(SMMUv3State *s, hwaddr offset,
uint64_t data, MemTxAttrs attrs)
{
+ Error *local_err = NULL;
+
switch (offset) {
case A_CR0:
s->cr[0] = data;
s->cr0ack = data & ~SMMU_CR0_RESERVED;
/* in case the command queue has been enabled */
- smmuv3_cmdq_consume(s);
- return MEMTX_OK;
+ smmuv3_cmdq_consume(s, &local_err);
+ break;
case A_CR1:
s->cr[1] = data;
- return MEMTX_OK;
+ break;
case A_CR2:
s->cr[2] = data;
- return MEMTX_OK;
+ break;
case A_IRQ_CTRL:
s->irq_ctrl = data;
- return MEMTX_OK;
+ break;
case A_GERRORN:
smmuv3_write_gerrorn(s, data);
/*
* By acknowledging the CMDQ_ERR, SW may notify cmds can
* be processed again
*/
- smmuv3_cmdq_consume(s);
- return MEMTX_OK;
+ smmuv3_cmdq_consume(s, &local_err);
+ break;
case A_GERROR_IRQ_CFG0: /* 64b */
s->gerror_irq_cfg0 = deposit64(s->gerror_irq_cfg0, 0, 32, data);
- return MEMTX_OK;
+ break;
case A_GERROR_IRQ_CFG0 + 4:
s->gerror_irq_cfg0 = deposit64(s->gerror_irq_cfg0, 32, 32, data);
- return MEMTX_OK;
+ break;
case A_GERROR_IRQ_CFG1:
s->gerror_irq_cfg1 = data;
- return MEMTX_OK;
+ break;
case A_GERROR_IRQ_CFG2:
s->gerror_irq_cfg2 = data;
- return MEMTX_OK;
+ break;
case A_GBPA:
/*
* If UPDATE is not set, the write is ignored. This is the only
@@ -1592,71 +1594,76 @@ static MemTxResult smmu_writel(SMMUv3State *s, hwaddr offset,
/* Ignore update bit as write is synchronous. */
s->gbpa = data & ~R_GBPA_UPDATE_MASK;
}
- return MEMTX_OK;
+ break;
case A_STRTAB_BASE: /* 64b */
s->strtab_base = deposit64(s->strtab_base, 0, 32, data);
- return MEMTX_OK;
+ break;
case A_STRTAB_BASE + 4:
s->strtab_base = deposit64(s->strtab_base, 32, 32, data);
- return MEMTX_OK;
+ break;
case A_STRTAB_BASE_CFG:
s->strtab_base_cfg = data;
if (FIELD_EX32(data, STRTAB_BASE_CFG, FMT) == 1) {
s->sid_split = FIELD_EX32(data, STRTAB_BASE_CFG, SPLIT);
s->features |= SMMU_FEATURE_2LVL_STE;
}
- return MEMTX_OK;
+ break;
case A_CMDQ_BASE: /* 64b */
s->cmdq.base = deposit64(s->cmdq.base, 0, 32, data);
s->cmdq.log2size = extract64(s->cmdq.base, 0, 5);
if (s->cmdq.log2size > SMMU_CMDQS) {
s->cmdq.log2size = SMMU_CMDQS;
}
- return MEMTX_OK;
+ break;
case A_CMDQ_BASE + 4: /* 64b */
s->cmdq.base = deposit64(s->cmdq.base, 32, 32, data);
- return MEMTX_OK;
+ break;
case A_CMDQ_PROD:
s->cmdq.prod = data;
- smmuv3_cmdq_consume(s);
- return MEMTX_OK;
+ smmuv3_cmdq_consume(s, &local_err);
+ break;
case A_CMDQ_CONS:
s->cmdq.cons = data;
- return MEMTX_OK;
+ break;
case A_EVENTQ_BASE: /* 64b */
s->eventq.base = deposit64(s->eventq.base, 0, 32, data);
s->eventq.log2size = extract64(s->eventq.base, 0, 5);
if (s->eventq.log2size > SMMU_EVENTQS) {
s->eventq.log2size = SMMU_EVENTQS;
}
- return MEMTX_OK;
+ break;
case A_EVENTQ_BASE + 4:
s->eventq.base = deposit64(s->eventq.base, 32, 32, data);
- return MEMTX_OK;
+ break;
case A_EVENTQ_PROD:
s->eventq.prod = data;
- return MEMTX_OK;
+ break;
case A_EVENTQ_CONS:
s->eventq.cons = data;
- return MEMTX_OK;
+ break;
case A_EVENTQ_IRQ_CFG0: /* 64b */
s->eventq_irq_cfg0 = deposit64(s->eventq_irq_cfg0, 0, 32, data);
- return MEMTX_OK;
+ break;
case A_EVENTQ_IRQ_CFG0 + 4:
s->eventq_irq_cfg0 = deposit64(s->eventq_irq_cfg0, 32, 32, data);
- return MEMTX_OK;
+ break;
case A_EVENTQ_IRQ_CFG1:
s->eventq_irq_cfg1 = data;
- return MEMTX_OK;
+ break;
case A_EVENTQ_IRQ_CFG2:
s->eventq_irq_cfg2 = data;
- return MEMTX_OK;
+ break;
default:
qemu_log_mask(LOG_UNIMP,
"%s Unexpected 32-bit access to 0x%"PRIx64" (WI)\n",
__func__, offset);
- return MEMTX_OK;
+ break;
}
+
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ return MEMTX_OK;
}
static MemTxResult smmu_write_mmio(void *opaque, hwaddr offset, uint64_t data,
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 14/37] hw/arm/smmuv3-accel: Add nested vSTE install/uninstall support
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (12 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 13/37] hw/arm/smmuv3: propagate smmuv3_cmdq_consume() errors to caller Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 15/37] hw/arm/smmuv3-accel: Install SMMUv3 GBPA based hwpt Shameer Kolothum
` (23 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
From: Nicolin Chen <nicolinc@nvidia.com>
A device placed behind a vSMMU instance must have corresponding vSTEs
(bypass, abort, or translate) installed. The bypass and abort proxy nested
HWPTs are pre-allocated.
For a translate HWPT, a vDEVICE object is allocated and associated with the
vIOMMU for each guest device. This allows the host kernel to establish a
virtual SID to physical SID mapping, which is required for handling
invalidations and event reporting.
A translate HWPT is allocated based on the guest STE configuration and
attached to the device when the guest issues SMMU_CMD_CFGI_STE or
SMMU_CMD_CFGI_STE_RANGE, provided the STE enables S1 translation.
If the guest STE is invalid or S1 translation is disabled, the device is
attached to one of the pre-allocated ABORT or BYPASS HWPTs instead.
While at it, export smmu_find_ste() for use here.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 197 +++++++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 22 ++++
hw/arm/smmuv3-internal.h | 1 +
hw/arm/smmuv3.c | 11 +-
hw/arm/trace-events | 2 +
include/hw/arm/smmuv3-common.h | 18 +++
6 files changed, 249 insertions(+), 2 deletions(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 9c2b917a11..877b7e0e17 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -51,6 +51,188 @@ static uint32_t smmuv3_accel_gbpa_hwpt(SMMUv3State *s, SMMUv3AccelState *accel)
accel->abort_hwpt_id : accel->bypass_hwpt_id;
}
+static bool
+smmuv3_accel_alloc_vdev(SMMUv3AccelDevice *accel_dev, int sid, Error **errp)
+{
+ SMMUv3AccelState *accel = accel_dev->s_accel;
+ HostIOMMUDeviceIOMMUFD *idev = accel_dev->idev;
+ IOMMUFDVdev *vdev = accel_dev->vdev;
+ uint32_t vdevice_id;
+
+ if (!idev || vdev) {
+ return true;
+ }
+
+ if (!iommufd_backend_alloc_vdev(idev->iommufd, idev->devid,
+ accel->viommu->viommu_id, sid,
+ &vdevice_id, errp)) {
+ return false;
+ }
+
+ vdev = g_new(IOMMUFDVdev, 1);
+ vdev->vdevice_id = vdevice_id;
+ vdev->virt_id = sid;
+ accel_dev->vdev = vdev;
+ return true;
+}
+
+static SMMUS1Hwpt *
+smmuv3_accel_dev_alloc_translate(SMMUv3AccelDevice *accel_dev, STE *ste,
+ Error **errp)
+{
+ uint64_t ste_0 = (uint64_t)ste->word[0] | (uint64_t)ste->word[1] << 32;
+ uint64_t ste_1 = (uint64_t)ste->word[2] | (uint64_t)ste->word[3] << 32;
+ HostIOMMUDeviceIOMMUFD *idev = accel_dev->idev;
+ SMMUv3AccelState *accel = accel_dev->s_accel;
+ struct iommu_hwpt_arm_smmuv3 nested_data = {
+ .ste = {
+ cpu_to_le64(ste_0 & STE0_MASK),
+ cpu_to_le64(ste_1 & STE1_MASK),
+ },
+ };
+ uint32_t hwpt_id = 0, flags = 0;
+ SMMUS1Hwpt *s1_hwpt;
+
+ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+ accel->viommu->viommu_id, flags,
+ IOMMU_HWPT_DATA_ARM_SMMUV3,
+ sizeof(nested_data), &nested_data,
+ &hwpt_id, errp)) {
+ return NULL;
+ }
+
+ s1_hwpt = g_new0(SMMUS1Hwpt, 1);
+ s1_hwpt->hwpt_id = hwpt_id;
+ trace_smmuv3_accel_translate_ste(accel_dev->vdev->virt_id, hwpt_id,
+ nested_data.ste[1], nested_data.ste[0]);
+ return s1_hwpt;
+}
+
+bool smmuv3_accel_install_ste(SMMUv3State *s, SMMUDevice *sdev, int sid,
+ Error **errp)
+{
+ SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid,
+ .inval_ste_allowed = true};
+ SMMUv3AccelState *accel = s->s_accel;
+ SMMUv3AccelDevice *accel_dev;
+ HostIOMMUDeviceIOMMUFD *idev;
+ uint32_t config, hwpt_id = 0;
+ SMMUS1Hwpt *s1_hwpt = NULL;
+ const char *type;
+ STE ste;
+
+ if (!accel || !accel->viommu) {
+ return true;
+ }
+
+ accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
+ if (!accel_dev->s_accel) {
+ return true;
+ }
+
+ idev = accel_dev->idev;
+ if (!smmuv3_accel_alloc_vdev(accel_dev, sid, errp)) {
+ return false;
+ }
+
+ if (smmu_find_ste(sdev->smmu, sid, &ste, &event)) {
+ /* No STE found, nothing to install */
+ return true;
+ }
+
+ /*
+ * Install the STE based on SMMU enabled/config:
+ * - attach a pre-allocated HWPT for abort/bypass
+ * - or a new HWPT for translate STE
+ *
+ * Note: The vdev remains associated with accel_dev even if HWPT
+ * attach/alloc fails, since the Guest–Host SID mapping stays
+ * valid as long as the device is behind the accelerated SMMUv3.
+ */
+ if (!smmu_enabled(s)) {
+ hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
+ } else {
+ config = STE_CONFIG(&ste);
+
+ if (!STE_VALID(&ste) || STE_CFG_ABORT(config)) {
+ hwpt_id = accel->abort_hwpt_id;
+ } else if (STE_CFG_BYPASS(config)) {
+ hwpt_id = accel->bypass_hwpt_id;
+ } else if (STE_CFG_S1_TRANSLATE(config)) {
+ s1_hwpt = smmuv3_accel_dev_alloc_translate(accel_dev, &ste, errp);
+ if (!s1_hwpt) {
+ return false;
+ }
+ hwpt_id = s1_hwpt->hwpt_id;
+ }
+ }
+
+ if (!hwpt_id) {
+ error_setg(errp, "Invalid STE config for sid 0x%x",
+ smmu_get_sid(&accel_dev->sdev));
+ return false;
+ }
+
+ if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp)) {
+ if (s1_hwpt) {
+ iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id);
+ g_free(s1_hwpt);
+ }
+ return false;
+ }
+
+ /* Free the previous s1_hwpt */
+ if (accel_dev->s1_hwpt) {
+ iommufd_backend_free_id(idev->iommufd, accel_dev->s1_hwpt->hwpt_id);
+ g_free(accel_dev->s1_hwpt);
+ }
+
+ accel_dev->s1_hwpt = s1_hwpt;
+ if (hwpt_id == accel->abort_hwpt_id) {
+ type = "abort";
+ } else if (hwpt_id == accel->bypass_hwpt_id) {
+ type = "bypass";
+ } else {
+ type = "translate";
+ }
+
+ trace_smmuv3_accel_install_ste(sid, type, hwpt_id);
+ return true;
+}
+
+bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
+ Error **errp)
+{
+ SMMUv3AccelState *accel = s->s_accel;
+ SMMUv3AccelDevice *accel_dev;
+ Error *local_err = NULL;
+ bool all_ok = true;
+
+ if (!accel || !accel->viommu) {
+ return true;
+ }
+
+ QLIST_FOREACH(accel_dev, &accel->device_list, next) {
+ uint32_t sid = smmu_get_sid(&accel_dev->sdev);
+
+ if (sid >= range->start && sid <= range->end) {
+ if (!smmuv3_accel_install_ste(s, &accel_dev->sdev,
+ sid, &local_err)) {
+ error_append_hint(&local_err, "Device 0x%x: Failed to install "
+ "STE\n", sid);
+ error_report_err(local_err);
+ local_err = NULL;
+ all_ok = false;
+ }
+ }
+ }
+
+ if (!all_ok) {
+ error_setg(errp, "Failed to install all STEs properly");
+ }
+ return all_ok;
+}
+
static bool
smmuv3_accel_alloc_viommu(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
Error **errp)
@@ -161,6 +343,7 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
HostIOMMUDeviceIOMMUFD *idev;
SMMUv3AccelDevice *accel_dev;
SMMUv3AccelState *accel;
+ IOMMUFDVdev *vdev;
SMMUDevice *sdev;
if (!sbus) {
@@ -181,6 +364,20 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
"0x%x", idev->devid);
}
+ if (accel_dev->s1_hwpt) {
+ iommufd_backend_free_id(accel_dev->idev->iommufd,
+ accel_dev->s1_hwpt->hwpt_id);
+ g_free(accel_dev->s1_hwpt);
+ accel_dev->s1_hwpt = NULL;
+ }
+
+ vdev = accel_dev->vdev;
+ if (vdev) {
+ iommufd_backend_free_id(accel->viommu->iommufd, vdev->vdevice_id);
+ g_free(vdev);
+ accel_dev->vdev = NULL;
+ }
+
accel_dev->idev = NULL;
accel_dev->s_accel = NULL;
QLIST_REMOVE(accel_dev, next);
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index efb631db4f..4e20b646dc 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -27,19 +27,41 @@ typedef struct SMMUv3AccelState {
QLIST_HEAD(, SMMUv3AccelDevice) device_list;
} SMMUv3AccelState;
+typedef struct SMMUS1Hwpt {
+ uint32_t hwpt_id;
+} SMMUS1Hwpt;
+
typedef struct SMMUv3AccelDevice {
SMMUDevice sdev;
HostIOMMUDeviceIOMMUFD *idev;
+ SMMUS1Hwpt *s1_hwpt;
+ IOMMUFDVdev *vdev;
QLIST_ENTRY(SMMUv3AccelDevice) next;
SMMUv3AccelState *s_accel;
} SMMUv3AccelDevice;
#ifdef CONFIG_ARM_SMMUV3_ACCEL
void smmuv3_accel_init(SMMUv3State *s);
+bool smmuv3_accel_install_ste(SMMUv3State *s, SMMUDevice *sdev, int sid,
+ Error **errp);
+bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
+ Error **errp);
#else
static inline void smmuv3_accel_init(SMMUv3State *s)
{
}
+static inline bool
+smmuv3_accel_install_ste(SMMUv3State *s, SMMUDevice *sdev, int sid,
+ Error **errp)
+{
+ return true;
+}
+static inline bool
+smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
+ Error **errp)
+{
+ return true;
+}
#endif
#endif /* HW_ARM_SMMUV3_ACCEL_H */
diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index 687ee6aaca..a6464425ec 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -353,6 +353,7 @@ typedef struct SMMUEventInfo {
} while (0)
void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *event);
+int smmu_find_ste(SMMUv3State *s, uint32_t sid, STE *ste, SMMUEventInfo *event);
static inline int oas2bits(int oas_field)
{
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index ade2b43ab8..7e29284267 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -630,8 +630,7 @@ bad_ste:
* Supports linear and 2-level stream table
* Return 0 on success, -EINVAL otherwise
*/
-static int smmu_find_ste(SMMUv3State *s, uint32_t sid, STE *ste,
- SMMUEventInfo *event)
+int smmu_find_ste(SMMUv3State *s, uint32_t sid, STE *ste, SMMUEventInfo *event)
{
dma_addr_t addr, strtab_base;
uint32_t log2size;
@@ -1341,6 +1340,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
}
trace_smmuv3_cmdq_cfgi_ste(sid);
+ if (!smmuv3_accel_install_ste(s, sdev, sid, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
smmuv3_flush_config(sdev);
break;
@@ -1361,6 +1364,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
sid_range.end = sid_range.start + mask;
trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end);
+ if (!smmuv3_accel_install_ste_range(s, &sid_range, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
smmu_configs_inv_sid_range(bs, sid_range);
break;
}
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index 2aaa0c40c7..8135c0c734 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -69,6 +69,8 @@ smmu_reset_exit(void) ""
#smmuv3-accel.c
smmuv3_accel_set_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
smmuv3_accel_unset_iommu_device(int devfn, uint32_t devid) "devfn=0x%x (idev devid=0x%x)"
+smmuv3_accel_translate_ste(uint32_t vsid, uint32_t hwpt_id, uint64_t ste_1, uint64_t ste_0) "vSID=0x%x hwpt_id=0x%x ste=%"PRIx64":%"PRIx64
+smmuv3_accel_install_ste(uint32_t vsid, const char * type, uint32_t hwpt_id) "vSID=0x%x ste type=%s hwpt_id=0x%x"
# strongarm.c
strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
index 153310248d..415b7ccde5 100644
--- a/include/hw/arm/smmuv3-common.h
+++ b/include/hw/arm/smmuv3-common.h
@@ -99,10 +99,28 @@ REG32(STE_7, 28)
#define STE_CFG_S2_ENABLED(config) (config & 0x2)
#define STE_CFG_ABORT(config) (!(config & 0x4))
#define STE_CFG_BYPASS(config) (config == 0x4)
+#define STE_CFG_S1_TRANSLATE(config) (config == 0x5)
#define SMMU_STE_VALID (1ULL << 0)
#define SMMU_STE_CFG_BYPASS (1ULL << 3)
+#define STE0_V MAKE_64BIT_MASK(0, 1)
+#define STE0_CONFIG MAKE_64BIT_MASK(1, 3)
+#define STE0_S1FMT MAKE_64BIT_MASK(4, 2)
+#define STE0_CTXPTR MAKE_64BIT_MASK(6, 50)
+#define STE0_S1CDMAX MAKE_64BIT_MASK(59, 5)
+#define STE0_MASK (STE0_S1CDMAX | STE0_CTXPTR | STE0_S1FMT | STE0_CONFIG | \
+ STE0_V)
+
+#define STE1_S1DSS MAKE_64BIT_MASK(0, 2)
+#define STE1_S1CIR MAKE_64BIT_MASK(2, 2)
+#define STE1_S1COR MAKE_64BIT_MASK(4, 2)
+#define STE1_S1CSH MAKE_64BIT_MASK(6, 2)
+#define STE1_S1STALLD MAKE_64BIT_MASK(27, 1)
+#define STE1_EATS MAKE_64BIT_MASK(28, 2)
+#define STE1_MASK (STE1_EATS | STE1_S1STALLD | STE1_S1CSH | STE1_S1COR | \
+ STE1_S1CIR | STE1_S1DSS)
+
/* Update STE fields */
#define STE_SET_VALID(ste, v) \
((ste)->word[0] = FIELD_DP32((ste)->word[0], STE_0, VALID, (v)))
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 15/37] hw/arm/smmuv3-accel: Install SMMUv3 GBPA based hwpt
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (13 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 14/37] hw/arm/smmuv3-accel: Add nested vSTE install/uninstall support Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 16/37] hw/pci/pci: Introduce a callback to retrieve the MSI doorbell GPA directly Shameer Kolothum
` (22 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
On guest reboot or on GBPA update, attach a nested HWPT based on the
GBPA.ABORT bit which either aborts all incoming transactions or bypasses
them.
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 36 ++++++++++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 9 +++++++++
hw/arm/smmuv3.c | 2 ++
3 files changed, 47 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 877b7e0e17..c125974d12 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -499,6 +499,42 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.unset_iommu_device = smmuv3_accel_unset_iommu_device,
};
+/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
+bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp)
+{
+ SMMUv3AccelState *accel = s->s_accel;
+ SMMUv3AccelDevice *accel_dev;
+ Error *local_err = NULL;
+ bool all_ok = true;
+ uint32_t hwpt_id;
+
+ if (!accel || !accel->viommu) {
+ return true;
+ }
+
+ hwpt_id = smmuv3_accel_gbpa_hwpt(s, accel);
+ QLIST_FOREACH(accel_dev, &accel->device_list, next) {
+ if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev, hwpt_id,
+ &local_err)) {
+ error_append_hint(&local_err, "Failed to attach GBPA hwpt %u for "
+ "idev devid %u", hwpt_id, accel_dev->idev->devid);
+ error_report_err(local_err);
+ local_err = NULL;
+ all_ok = false;
+ }
+ }
+ if (!all_ok) {
+ error_setg(errp, "Failed to attach all GBPA based HWPTs properly");
+ }
+ return all_ok;
+}
+
+void smmuv3_accel_reset(SMMUv3State *s)
+{
+ /* Attach a HWPT based on GBPA reset value */
+ smmuv3_accel_attach_gbpa_hwpt(s, NULL);
+}
+
static void smmuv3_accel_as_init(SMMUv3State *s)
{
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index 4e20b646dc..c7ed4dce3a 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -46,6 +46,8 @@ bool smmuv3_accel_install_ste(SMMUv3State *s, SMMUDevice *sdev, int sid,
Error **errp);
bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
Error **errp);
+bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp);
+void smmuv3_accel_reset(SMMUv3State *s);
#else
static inline void smmuv3_accel_init(SMMUv3State *s)
{
@@ -62,6 +64,13 @@ smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
{
return true;
}
+static inline bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp)
+{
+ return true;
+}
+static inline void smmuv3_accel_reset(SMMUv3State *s)
+{
+}
#endif
#endif /* HW_ARM_SMMUV3_ACCEL_H */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 7e29284267..7a32afd800 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1600,6 +1600,7 @@ static MemTxResult smmu_writel(SMMUv3State *s, hwaddr offset,
if (data & R_GBPA_UPDATE_MASK) {
/* Ignore update bit as write is synchronous. */
s->gbpa = data & ~R_GBPA_UPDATE_MASK;
+ smmuv3_accel_attach_gbpa_hwpt(s, &local_err);
}
break;
case A_STRTAB_BASE: /* 64b */
@@ -1887,6 +1888,7 @@ static void smmu_reset_exit(Object *obj, ResetType type)
}
smmuv3_init_regs(s);
+ smmuv3_accel_reset(s);
}
static void smmu_realize(DeviceState *d, Error **errp)
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 16/37] hw/pci/pci: Introduce a callback to retrieve the MSI doorbell GPA directly
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (14 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 15/37] hw/arm/smmuv3-accel: Install SMMUv3 GBPA based hwpt Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 17/37] hw/arm/smmuv3-accel: Implement get_msi_direct_gpa callback Shameer Kolothum
` (21 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
For certain vIOMMU implementations, such as SMMUv3 in accelerated mode,
the translation tables are programmed directly into the physical SMMUv3
in a nested configuration. While QEMU knows where the guest tables live,
safely walking them in software would require trapping and ordering all
guest invalidations on every command queue. Without this, QEMU could race
with guest updates and walk stale or freed page tables.
This constraint is fundamental to the design of HW-accelerated vSMMU when
used with downstream vfio-pci endpoint devices, where QEMU must never walk
guest translation tables and must rely on the physical SMMU for
translation. Future accelerated vSMMU features, such as virtual CMDQ, will
also prevent trapping invalidations, reinforcing this restriction.
For vfio-pci endpoints behind such a vSMMU, the only translation QEMU
needs is for the MSI doorbell used when setting up KVM MSI route tables.
Instead of attempting a software walk, introduce an optional vIOMMU
callback that returns the MSI doorbell GPA directly.
kvm_arch_fixup_msi_route() uses this callback when available and ignores
the guest provided IOVA in that case.
If the vIOMMU does not implement the callback, we fall back to the
existing IOMMU based address space translation path.
This ensures correct MSI routing for accelerated SMMUv3 + VFIO passthrough
while avoiding unsafe software walks of guest translation tables.
As a related change, replace RCU_READ_LOCK_GUARD() with explicit
rcu_read_lock()/rcu_read_unlock(). The introduction of an early goto
(set_doorbell) path means the RCU read side critical section can no longer
be safely scoped using RCU_READ_LOCK_GUARD().
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pci.c | 17 +++++++++++++++++
include/hw/pci/pci.h | 17 +++++++++++++++++
target/arm/kvm.c | 18 +++++++++++++++++-
3 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 101e745bd5..9035caca92 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2979,6 +2979,23 @@ bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
return aliased;
}
+bool pci_device_iommu_msi_direct_gpa(PCIDevice *dev, hwaddr *out_doorbell)
+{
+ PCIBus *bus;
+ PCIBus *iommu_bus;
+ int devfn;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+ if (iommu_bus) {
+ if (iommu_bus->iommu_ops->get_msi_direct_gpa) {
+ *out_doorbell = iommu_bus->iommu_ops->get_msi_direct_gpa(bus,
+ iommu_bus->iommu_opaque, devfn);
+ return true;
+ }
+ }
+ return false;
+}
+
AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
{
PCIBus *bus;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index ddb0c98e9f..d9835dfd0d 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -683,6 +683,22 @@ typedef struct PCIIOMMUOps {
uint32_t pasid, bool priv_req, bool exec_req,
hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
bool is_write);
+ /**
+ * @get_msi_direct_gpa: get the guest physical address of MSI doorbell
+ * for the device on a PCI bus.
+ *
+ * Optional callback. If implemented, it must return a valid guest
+ * physical address for the MSI doorbell
+ *
+ * @bus: the #PCIBus being accessed.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number
+ *
+ * Returns: the guest physical address of the MSI doorbell.
+ */
+ uint64_t (*get_msi_direct_gpa)(PCIBus *bus, void *opaque, int devfn);
} PCIIOMMUOps;
bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
@@ -691,6 +707,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
Error **errp);
void pci_device_unset_iommu_device(PCIDevice *dev);
+bool pci_device_iommu_msi_direct_gpa(PCIDevice *dev, hwaddr *out_doorbell);
/**
* pci_device_get_viommu_flags: get vIOMMU flags.
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 48f853fff8..0828e8b87b 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1621,26 +1621,42 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
return 0;
}
+ /*
+ * We do have an IOMMU address space, but for some vIOMMU implementations
+ * (e.g. accelerated SMMUv3) the translation tables are programmed into
+ * the physical SMMUv3 in the host (nested S1=guest, S2=host). QEMU cannot
+ * walk these tables in a safe way, so in that case we obtain the MSI
+ * doorbell GPA directly from the vIOMMU backend and ignore the gIOVA
+ * @address.
+ */
+ if (pci_device_iommu_msi_direct_gpa(dev, &doorbell_gpa)) {
+ goto set_doorbell;
+ }
+
/* MSI doorbell address is translated by an IOMMU */
- RCU_READ_LOCK_GUARD();
+ rcu_read_lock();
mr = address_space_translate(as, address, &xlat, &len, true,
MEMTXATTRS_UNSPECIFIED);
if (!mr) {
+ rcu_read_unlock();
return 1;
}
mrs = memory_region_find(mr, xlat, 1);
if (!mrs.mr) {
+ rcu_read_unlock();
return 1;
}
doorbell_gpa = mrs.offset_within_address_space;
memory_region_unref(mrs.mr);
+ rcu_read_unlock();
+set_doorbell:
route->u.msi.address_lo = doorbell_gpa;
route->u.msi.address_hi = doorbell_gpa >> 32;
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 17/37] hw/arm/smmuv3-accel: Implement get_msi_direct_gpa callback
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (15 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 16/37] hw/pci/pci: Introduce a callback to retrieve the MSI doorbell GPA directly Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 18/37] hw/arm/virt: Set msi-gpa property Shameer Kolothum
` (20 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Accelerated SMMUv3 instances rely on the physical SMMUv3 for nested
translation (guest Stage-1, host Stage-2). In this mode, the guest Stage-1
tables are programmed directly into hardware, and QEMU must not attempt to
walk them for translation, as doing so is not reliably safe. For vfio-pci
endpoints behind such a vSMMU, the only translation QEMU needs to perform
is for the MSI doorbell used during KVM MSI setup.
Implement the callback so that kvm_arch_fixup_msi_route() can retrieve the
MSI doorbell GPA directly, instead of attempting a software walk of the
guest translation tables.
Also introduce an SMMUv3 device property to carry the MSI doorbell GPA.
This property will be set by the virt machine in a subsequent patch.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 10 ++++++++++
hw/arm/smmuv3.c | 2 ++
include/hw/arm/smmuv3.h | 1 +
3 files changed, 13 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index c125974d12..c6ee123cdf 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -393,6 +393,15 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
}
}
+static uint64_t smmuv3_accel_get_msi_gpa(PCIBus *bus, void *opaque, int devfn)
+{
+ SMMUState *bs = opaque;
+ SMMUv3State *s = ARM_SMMUV3(bs);
+
+ g_assert(s->msi_gpa);
+ return s->msi_gpa;
+}
+
/*
* Only allow PCIe bridges, pxb-pcie roots, and GPEX roots so vfio-pci
* endpoints can sit downstream. Accelerated SMMUv3 requires a vfio-pci
@@ -497,6 +506,7 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.get_viommu_flags = smmuv3_accel_get_viommu_flags,
.set_iommu_device = smmuv3_accel_set_iommu_device,
.unset_iommu_device = smmuv3_accel_unset_iommu_device,
+ .get_msi_direct_gpa = smmuv3_accel_get_msi_gpa,
};
/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 7a32afd800..6ed9914b1e 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1998,6 +1998,8 @@ static const Property smmuv3_properties[] = {
* Defaults to stage 1
*/
DEFINE_PROP_STRING("stage", SMMUv3State, stage),
+ /* GPA of MSI doorbell, for SMMUv3 accel use. */
+ DEFINE_PROP_UINT64("msi-gpa", SMMUv3State, msi_gpa, 0),
};
static void smmuv3_instance_init(Object *obj)
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index e54ece2d38..5616a8a2be 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -67,6 +67,7 @@ struct SMMUv3State {
/* SMMU has HW accelerator support for nested S1 + s2 */
bool accel;
struct SMMUv3AccelState *s_accel;
+ uint64_t msi_gpa;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 18/37] hw/arm/virt: Set msi-gpa property
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (16 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 17/37] hw/arm/smmuv3-accel: Implement get_msi_direct_gpa callback Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 19/37] hw/arm/smmuv3-accel: Add support to issue invalidation cmd to host Shameer Kolothum
` (19 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Set the MSI doorbell GPA property for accelerated SMMUv3 devices for use
by KVM MSI setup. Also, since any meaningful use of vfio-pci devices with
an accelerated SMMUv3 requires both KVM and a kernel irqchip, ensure
those are specified when accel=on is selected.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/virt.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 4badc1a734..91fec582ed 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3061,6 +3061,26 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
object_property_set_link(OBJECT(dev), "secure-memory",
OBJECT(vms->secure_sysmem), NULL);
}
+ if (object_property_find(OBJECT(dev), "accel") &&
+ object_property_get_bool(OBJECT(dev), "accel", &error_abort)) {
+ hwaddr db_start = 0;
+
+ if (!kvm_enabled() || !kvm_irqchip_in_kernel()) {
+ error_setg(errp, "SMMUv3 accel=on requires KVM with "
+ "kernel-irqchip=on support");
+ return;
+ }
+
+ if (vms->msi_controller == VIRT_MSI_CTRL_ITS) {
+ /* GITS_TRANSLATER page + offset */
+ db_start = base_memmap[VIRT_GIC_ITS].base + 0x10000 + 0x40;
+ } else if (vms->msi_controller == VIRT_MSI_CTRL_GICV2M) {
+ /* MSI_SETSPI_NS page + offset */
+ db_start = base_memmap[VIRT_GIC_V2M].base + 0x40;
+ }
+ object_property_set_uint(OBJECT(dev), "msi-gpa", db_start,
+ &error_abort);
+ }
}
}
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 19/37] hw/arm/smmuv3-accel: Add support to issue invalidation cmd to host
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (17 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 18/37] hw/arm/virt: Set msi-gpa property Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 20/37] hw/arm/smmuv3: Initialize ID registers early during realize() Shameer Kolothum
` (18 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Provide a helper and use that to issue the invalidation cmd to host SMMUv3.
We only issue one cmd at a time for now.
Support for batching of commands will be added later after analysing the
impact.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 36 ++++++++++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 8 ++++++++
hw/arm/smmuv3.c | 16 ++++++++++++++++
3 files changed, 60 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index c6ee123cdf..89dc6f991c 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -233,6 +233,42 @@ bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
return all_ok;
}
+/*
+ * This issues the invalidation cmd to the host SMMUv3.
+ *
+ * sdev is non-NULL for SID based invalidations (e.g. CFGI_CD), and NULL for
+ * non SID invalidations such as SMMU_CMD_TLBI_NH_ASID and SMMU_CMD_TLBI_NH_VA.
+ */
+bool smmuv3_accel_issue_inv_cmd(SMMUv3State *bs, void *cmd, SMMUDevice *sdev,
+ Error **errp)
+{
+ SMMUv3State *s = ARM_SMMUV3(bs);
+ SMMUv3AccelState *accel = s->s_accel;
+ uint32_t entry_num = 1;
+
+ /*
+ * No accel or viommu means no VFIO/IOMMUFD devices, nothing to
+ * invalidate.
+ */
+ if (!accel || !accel->viommu) {
+ return true;
+ }
+
+ /*
+ * SID based invalidations (e.g. CFGI_CD) apply only to vfio-pci endpoints
+ * with a valid vIOMMU vdev.
+ */
+ if (sdev && !container_of(sdev, SMMUv3AccelDevice, sdev)->vdev) {
+ return true;
+ }
+
+ /* Single command (entry_num = 1); no need to check returned entry_num */
+ return iommufd_backend_invalidate_cache(
+ accel->viommu->iommufd, accel->viommu->viommu_id,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3,
+ sizeof(Cmd), &entry_num, cmd, errp);
+}
+
static bool
smmuv3_accel_alloc_viommu(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
Error **errp)
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index c7ed4dce3a..41b37e3122 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -47,6 +47,8 @@ bool smmuv3_accel_install_ste(SMMUv3State *s, SMMUDevice *sdev, int sid,
bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
Error **errp);
bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp);
+bool smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
+ Error **errp);
void smmuv3_accel_reset(SMMUv3State *s);
#else
static inline void smmuv3_accel_init(SMMUv3State *s)
@@ -68,6 +70,12 @@ static inline bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp)
{
return true;
}
+static inline bool
+smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
+ Error **errp)
+{
+ return true;
+}
static inline void smmuv3_accel_reset(SMMUv3State *s)
{
}
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 6ed9914b1e..4efef73373 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1388,6 +1388,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
trace_smmuv3_cmdq_cfgi_cd(sid);
smmuv3_flush_config(sdev);
+ if (!smmuv3_accel_issue_inv_cmd(s, &cmd, sdev, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
break;
}
case SMMU_CMD_TLBI_NH_ASID:
@@ -1411,6 +1415,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
trace_smmuv3_cmdq_tlbi_nh_asid(asid);
smmu_inv_notifiers_all(&s->smmu_state);
smmu_iotlb_inv_asid_vmid(bs, asid, vmid);
+ if (!smmuv3_accel_issue_inv_cmd(s, &cmd, NULL, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
break;
}
case SMMU_CMD_TLBI_NH_ALL:
@@ -1438,6 +1446,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
trace_smmuv3_cmdq_tlbi_nsnh();
smmu_inv_notifiers_all(&s->smmu_state);
smmu_iotlb_inv_all(bs);
+ if (!smmuv3_accel_issue_inv_cmd(s, &cmd, NULL, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
break;
case SMMU_CMD_TLBI_NH_VAA:
case SMMU_CMD_TLBI_NH_VA:
@@ -1446,6 +1458,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
break;
}
smmuv3_range_inval(bs, &cmd, SMMU_STAGE_1);
+ if (!smmuv3_accel_issue_inv_cmd(s, &cmd, NULL, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
break;
case SMMU_CMD_TLBI_S12_VMALL:
{
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 20/37] hw/arm/smmuv3: Initialize ID registers early during realize()
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (18 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 19/37] hw/arm/smmuv3-accel: Add support to issue invalidation cmd to host Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 21/37] hw/arm/smmuv3-accel: Get host SMMUv3 hw info and validate Shameer Kolothum
` (17 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Factor out ID register init into smmuv3_init_id_regs() and call it from
realize(). This ensures ID registers are initialized early for use in the
accelerated SMMUv3 path and will be utilized in a subsequent patch.
Other registers remain initialized in smmuv3_reset().
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 4efef73373..e301bb467d 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -258,7 +258,12 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info)
info->recorded = true;
}
-static void smmuv3_init_regs(SMMUv3State *s)
+/*
+ * Called during realize(), as the ID registers will be accessed early in the
+ * SMMUv3 accel path for feature compatibility checks. The remaining registers
+ * are initialized later in smmuv3_reset().
+ */
+static void smmuv3_init_id_regs(SMMUv3State *s)
{
/* Based on sys property, the stages supported in smmu will be advertised.*/
if (s->stage && !strcmp("2", s->stage)) {
@@ -298,7 +303,11 @@ static void smmuv3_init_regs(SMMUv3State *s)
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
+ s->aidr = 0x1;
+}
+static void smmuv3_reset(SMMUv3State *s)
+{
s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS);
s->cmdq.prod = 0;
s->cmdq.cons = 0;
@@ -310,7 +319,6 @@ static void smmuv3_init_regs(SMMUv3State *s)
s->features = 0;
s->sid_split = 0;
- s->aidr = 0x1;
s->cr[0] = 0;
s->cr0ack = 0;
s->irq_ctrl = 0;
@@ -1903,7 +1911,7 @@ static void smmu_reset_exit(Object *obj, ResetType type)
c->parent_phases.exit(obj, type);
}
- smmuv3_init_regs(s);
+ smmuv3_reset(s);
smmuv3_accel_reset(s);
}
@@ -1935,6 +1943,7 @@ static void smmu_realize(DeviceState *d, Error **errp)
sysbus_init_mmio(dev, &sys->iomem);
smmu_init_irq(s, dev);
+ smmuv3_init_id_regs(s);
}
static const VMStateDescription vmstate_smmuv3_queue = {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 21/37] hw/arm/smmuv3-accel: Get host SMMUv3 hw info and validate
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (19 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 20/37] hw/arm/smmuv3: Initialize ID registers early during realize() Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 22/37] hw/pci-host/gpex: Allow to generate preserve boot config DSM #5 Shameer Kolothum
` (16 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Just before the device gets attached to the SMMUv3, make sure QEMU SMMUv3
features are compatible with the host SMMUv3.
Not all fields in the host SMMUv3 IDR registers are meaningful for userspace.
Only the following fields can be used:
- IDR0: ST_LEVEL, TERM_MODEL, STALL_MODEL, TTENDIAN, CD2L, ASID16, TTF
- IDR1: SIDSIZE, SSIDSIZE
- IDR3: BBML, RIL
- IDR5: VAX, GRAN64K, GRAN16K, GRAN4K
For now, the check is to make sure the features are in sync to enable
basic accelerated SMMUv3 support. AIDR is not checked, as hardware
implementations often provide a mix of architecture features regardless
of the revision reported in AIDR.
Note that SSIDSIZE check will be added later when support for PASID is
introduced.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 101 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 101 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 89dc6f991c..33011962e3 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -27,6 +27,99 @@
static MemoryRegion root, sysmem;
static AddressSpace *shared_as_sysmem;
+static bool
+smmuv3_accel_check_hw_compatible(SMMUv3State *s,
+ struct iommu_hw_info_arm_smmuv3 *info,
+ Error **errp)
+{
+ /* QEMU SMMUv3 supports both linear and 2-level stream tables */
+ if (FIELD_EX32(info->idr[0], IDR0, STLEVEL) !=
+ FIELD_EX32(s->idr[0], IDR0, STLEVEL)) {
+ error_setg(errp, "Host SMMUv3 Stream Table format mismatch "
+ "(host STLEVEL=%u, QEMU STLEVEL=%u)",
+ FIELD_EX32(info->idr[0], IDR0, STLEVEL),
+ FIELD_EX32(s->idr[0], IDR0, STLEVEL));
+ return false;
+ }
+
+ /* QEMU SMMUv3 supports only little-endian translation table walks */
+ if (FIELD_EX32(info->idr[0], IDR0, TTENDIAN) >
+ FIELD_EX32(s->idr[0], IDR0, TTENDIAN)) {
+ error_setg(errp, "Host SMMUv3 doesn't support Little-endian "
+ "translation table");
+ return false;
+ }
+
+ /* QEMU SMMUv3 supports only AArch64 translation table format */
+ if (FIELD_EX32(info->idr[0], IDR0, TTF) <
+ FIELD_EX32(s->idr[0], IDR0, TTF)) {
+ error_setg(errp, "Host SMMUv3 doesn't support AArch64 translation "
+ "table format");
+ return false;
+ }
+
+ /* QEMU SMMUv3 supports SIDSIZE 16 */
+ if (FIELD_EX32(info->idr[1], IDR1, SIDSIZE) <
+ FIELD_EX32(s->idr[1], IDR1, SIDSIZE)) {
+ error_setg(errp, "Host SMMUv3 SIDSIZE not compatible "
+ "(host=%u, QEMU=%u)",
+ FIELD_EX32(info->idr[1], IDR1, SIDSIZE),
+ FIELD_EX32(s->idr[1], IDR1, SIDSIZE));
+ return false;
+ }
+
+ /* QEMU SMMUv3 supports Range Invalidation by default */
+ if (FIELD_EX32(info->idr[3], IDR3, RIL) !=
+ FIELD_EX32(s->idr[3], IDR3, RIL)) {
+ error_setg(errp, "Host SMMUv3 doesn't support Range Invalidation");
+ return false;
+ }
+
+ /* QEMU SMMUv3 supports GRAN4K/GRAN16K/GRAN64K translation granules */
+ if (FIELD_EX32(info->idr[5], IDR5, GRAN4K) !=
+ FIELD_EX32(s->idr[5], IDR5, GRAN4K)) {
+ error_setg(errp, "Host SMMUv3 doesn't support 4K translation granule");
+ return false;
+ }
+ if (FIELD_EX32(info->idr[5], IDR5, GRAN16K) !=
+ FIELD_EX32(s->idr[5], IDR5, GRAN16K)) {
+ error_setg(errp, "Host SMMUv3 doesn't support 16K translation granule");
+ return false;
+ }
+ if (FIELD_EX32(info->idr[5], IDR5, GRAN64K) !=
+ FIELD_EX32(s->idr[5], IDR5, GRAN64K)) {
+ error_setg(errp, "Host SMMUv3 doesn't support 64K translation granule");
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+smmuv3_accel_hw_compatible(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
+ Error **errp)
+{
+ struct iommu_hw_info_arm_smmuv3 info;
+ uint32_t data_type;
+ uint64_t caps;
+
+ if (!iommufd_backend_get_device_info(idev->iommufd, idev->devid, &data_type,
+ &info, sizeof(info), &caps, errp)) {
+ return false;
+ }
+
+ if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) {
+ error_setg(errp, "Wrong data type (%d) for Host SMMUv3 device info",
+ data_type);
+ return false;
+ }
+
+ if (!smmuv3_accel_check_hw_compatible(s, &info, errp)) {
+ return false;
+ }
+ return true;
+}
+
static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
PCIBus *bus, int devfn)
{
@@ -353,6 +446,14 @@ static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
return true;
}
+ /*
+ * Check the host SMMUv3 associated with the dev is compatible with the
+ * QEMU SMMUv3 accel.
+ */
+ if (!smmuv3_accel_hw_compatible(s, idev, errp)) {
+ return false;
+ }
+
if (s->s_accel->viommu) {
goto done;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 22/37] hw/pci-host/gpex: Allow to generate preserve boot config DSM #5
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (20 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 21/37] hw/arm/smmuv3-accel: Get host SMMUv3 hw info and validate Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 23/37] hw/arm/virt: Set PCI preserve_config for accel SMMUv3 Shameer Kolothum
` (15 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
From: Eric Auger <eric.auger@redhat.com>
Add a 'preserve_config' field in struct GPEXConfig and, if set, generate
the _DSM function #5 for preserving PCI boot configurations.
This will be used for SMMUv3 accel=on support in a subsequent patch. When
SMMUv3 acceleration (accel=on) is enabled, QEMU exposes IORT Reserved
Memory Region (RMR) nodes to support MSI doorbell translations. As per
the Arm IORT specification, using IORT RMRs mandates the presence of
_DSM function #5 so that the OS retains the firmware-assigned PCI
configuration. Hence, this patch adds conditional support for generating
_DSM #5.
According to the ACPI Specification, Revision 6.6, Section 9.1.1 -
“_DSM (Device Specific Method)”,
"
If Function Index is zero, the return is a buffer containing one bit for
each function index, starting with zero. Bit 0 indicates whether there
is support for any functions other than function 0 for the specified
UUID and Revision ID. If set to zero, no functions are supported (other
than function zero) for the specified UUID and Revision ID. If set to
one, at least one additional function is supported. For all other bits
in the buffer, a bit is set to zero to indicate if that function index
is not supported for the specific UUID and Revision ID. (For example,
bit 1 set to 0 indicates that function index 1 is not supported for the
specific UUID and Revision ID.)
"
Please refer to the PCI Firmware Specification, Revision 3.3, Section 4.6.5 —
"_DSM for Preserving PCI Boot Configurations" for Function 5 of the _DSM
method.
Also, while at it, move the byte_list declaration to the top of the
function for clarity.
At the moment, DSM generation is not yet enabled.
The resulting AML when preserve_config=true is:
Method (_DSM, 4, NotSerialized)
{
If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d")))
{
If ((Arg2 == Zero))
{
Return (Buffer (One)
{
0x21
})
}
If ((Arg2 == 0x05))
{
Return (Zero)
}
}
...
}
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
[Shameer: Removed possible duplicate _DSM creations]
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci-host/gpex-acpi.c | 29 +++++++++++++++++++++++------
include/hw/pci-host/gpex.h | 1 +
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c
index 4587baeb78..d9820f9b41 100644
--- a/hw/pci-host/gpex-acpi.c
+++ b/hw/pci-host/gpex-acpi.c
@@ -51,10 +51,11 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq,
}
}
-static Aml *build_pci_host_bridge_dsm_method(void)
+static Aml *build_pci_host_bridge_dsm_method(bool preserve_config)
{
Aml *method = aml_method("_DSM", 4, AML_NOTSERIALIZED);
Aml *UUID, *ifctx, *ifctx1, *buf;
+ uint8_t byte_list[1] = {0};
/* PCI Firmware Specification 3.0
* 4.6.1. _DSM for PCI Express Slot Information
@@ -64,10 +65,23 @@ static Aml *build_pci_host_bridge_dsm_method(void)
UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D");
ifctx = aml_if(aml_equal(aml_arg(0), UUID));
ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0)));
- uint8_t byte_list[1] = {0};
+ if (preserve_config) {
+ /* support functions other than 0, specifically function 5 */
+ byte_list[0] = 0x21;
+ }
buf = aml_buffer(1, byte_list);
aml_append(ifctx1, aml_return(buf));
aml_append(ifctx, ifctx1);
+ if (preserve_config) {
+ Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5)));
+ /*
+ * 0 - The operating system must not ignore the PCI configuration that
+ * firmware has done at boot time.
+ */
+ aml_append(ifctx2, aml_return(aml_int(0)));
+ aml_append(ifctx, ifctx2);
+ }
+
aml_append(method, ifctx);
byte_list[0] = 0;
@@ -77,12 +91,13 @@ static Aml *build_pci_host_bridge_dsm_method(void)
}
static void acpi_dsdt_add_host_bridge_methods(Aml *dev,
- bool enable_native_pcie_hotplug)
+ bool enable_native_pcie_hotplug,
+ bool preserve_config)
{
/* Declare an _OSC (OS Control Handoff) method */
aml_append(dev,
build_pci_host_bridge_osc_method(enable_native_pcie_hotplug));
- aml_append(dev, build_pci_host_bridge_dsm_method());
+ aml_append(dev, build_pci_host_bridge_dsm_method(preserve_config));
}
void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg)
@@ -152,7 +167,8 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg)
build_cxl_osc_method(dev);
} else {
/* pxb bridges do not have ACPI PCI Hot-plug enabled */
- acpi_dsdt_add_host_bridge_methods(dev, true);
+ acpi_dsdt_add_host_bridge_methods(dev, true,
+ cfg->preserve_config);
}
aml_append(scope, dev);
@@ -227,7 +243,8 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg)
}
aml_append(dev, aml_name_decl("_CRS", rbuf));
- acpi_dsdt_add_host_bridge_methods(dev, cfg->pci_native_hotplug);
+ acpi_dsdt_add_host_bridge_methods(dev, cfg->pci_native_hotplug,
+ cfg->preserve_config);
Aml *dev_res0 = aml_device("%s", "RES0");
aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02")));
diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h
index 695886dedd..1da9c85bce 100644
--- a/include/hw/pci-host/gpex.h
+++ b/include/hw/pci-host/gpex.h
@@ -46,6 +46,7 @@ struct GPEXConfig {
int irq;
PCIBus *bus;
bool pci_native_hotplug;
+ bool preserve_config;
};
typedef struct GPEXIrq GPEXIrq;
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 23/37] hw/arm/virt: Set PCI preserve_config for accel SMMUv3
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (21 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 22/37] hw/pci-host/gpex: Allow to generate preserve boot config DSM #5 Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 24/37] tests/qtest/bios-tables-test: Prepare for IORT revison upgrade Shameer Kolothum
` (14 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Introduce a new pci_preserve_config field in virt machine state which
allows the generation of DSM #5. This field is only set if accel SMMU
is instantiated.
In a subsequent patch, SMMUv3 accel mode will make use of IORT RMR nodes
to enable nested translation of MSI doorbell addresses. IORT RMR requires
_DSM #5 to be set for the PCI host bridge so that the Guest kernel
preserves the PCI boot configuration.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/virt-acpi-build.c | 8 ++++++++
hw/arm/virt.c | 1 +
include/hw/arm/virt.h | 1 +
3 files changed, 10 insertions(+)
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 03b4342574..9032a5df1c 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -164,6 +164,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap,
.pci_native_hotplug = !acpi_pcihp,
};
+ /*
+ * Accel SMMU requires RMRs for MSI 1-1 mapping, which require _DSM
+ * function 5 (_DSM for Preserving PCI Boot Configurations).
+ */
+ if (vms->pci_preserve_config) {
+ cfg.preserve_config = true;
+ }
+
if (vms->highmem_mmio) {
cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO];
}
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 91fec582ed..899b02e1f7 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3080,6 +3080,7 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
}
object_property_set_uint(OBJECT(dev), "msi-gpa", db_start,
&error_abort);
+ vms->pci_preserve_config = true;
}
}
}
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 5907d41dbb..3b382bdf49 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -182,6 +182,7 @@ struct VirtMachineState {
bool legacy_smmuv3_present;
MemoryRegion *sysmem;
MemoryRegion *secure_sysmem;
+ bool pci_preserve_config;
};
#define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 24/37] tests/qtest/bios-tables-test: Prepare for IORT revison upgrade
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (22 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 23/37] hw/arm/virt: Set PCI preserve_config for accel SMMUv3 Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 25/37] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding Shameer Kolothum
` (13 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
A subsequent patch will upgrade the IORT revision to 5 to add support
for IORT RMR nodes.
Add the affected IORT blobs to the allowed-diff list for the bios-tables
tests.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
tests/qtest/bios-tables-test-allowed-diff.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..3279638ad0 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,5 @@
/* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/aarch64/virt/IORT",
+"tests/data/acpi/aarch64/virt/IORT.its_off",
+"tests/data/acpi/aarch64/virt/IORT.smmuv3-legacy",
+"tests/data/acpi/aarch64/virt/IORT.smmuv3-dev",
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* [PATCH v9 25/37] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (23 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 24/37] tests/qtest/bios-tables-test: Prepare for IORT revison upgrade Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 26/37] tests/qtest/bios-tables-test: Update IORT blobs after revision upgrade Shameer Kolothum
` (12 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Jean-Philippe Brucker
From: Eric Auger <eric.auger@redhat.com>
To handle SMMUv3 accel=on mode (which configures the host SMMUv3 in nested
mode), it is practical to expose the guest with reserved memory regions
(RMRs) covering the IOVAs used by the host kernel to map physical MSI
doorbells.
Those IOVAs belong to [0x8000000, 0x8100000] matching MSI_IOVA_BASE and
MSI_IOVA_LENGTH definitions in kernel arm-smmu-v3 driver. This is the
window used to allocate IOVAs matching physical MSI doorbells.
With those RMRs, the guest is forced to use a flat mapping for this range.
Hence the assigned device is programmed with one IOVA from this range.
Stage 1, owned by the guest, has a flat mapping for this IOVA. Stage 2,
owned by the VMM, then enforces a mapping from this IOVA to the physical
MSI doorbell.
The creation of those RMR nodes is only relevant if nested stage SMMU is
in use, along with VFIO. As VFIO devices can be hotplugged, all RMRs need
to be created in advance.
Initialise AcpiIortSMMUv3Dev structures to avoid using uninitialised
state when building the IORT, as legacy and device SMMUv3 paths
populate different fields now (e.g. accel).
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Suggested-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/virt-acpi-build.c | 115 +++++++++++++++++++++++++++++++++++----
1 file changed, 105 insertions(+), 10 deletions(-)
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 9032a5df1c..4ae4cbc6cd 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -257,6 +257,29 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms)
#define ROOT_COMPLEX_ENTRY_SIZE 36
#define IORT_NODE_OFFSET 48
+#define IORT_RMR_NUM_ID_MAPPINGS 1
+#define IORT_RMR_NUM_MEM_RANGE_DESC 1
+#define IORT_RMR_COMMON_HEADER_SIZE 28
+#define IORT_RMR_MEM_RANGE_DESC_SIZE 20
+
+/*
+ * IORT RMR flags:
+ * Bit[0] = 0 Disallow remapping of reserved ranges
+ * Bit[1] = 0 Unprivileged access
+ * Bits[9:2] = 0x00 Device nGnRnE memory
+ */
+#define IORT_RMR_FLAGS 0
+
+/*
+ * MSI doorbell IOVA window used by the host kernel SMMUv3 driver.
+ * Described in IORT RMR nodes to reserve the IOVA range where the host
+ * kernel maps physical MSI doorbells for devices. This ensures guests
+ * preserve a flat mapping for MSI doorbell in nested SMMUv3(accel=on)
+ * configurations.
+ */
+#define MSI_IOVA_BASE 0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
/*
* Append an ID mapping entry as described by "Table 4 ID mapping format" in
* "IO Remapping Table System Software on ARM Platforms", Chapter 3.
@@ -265,7 +288,8 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms)
* Note that @id_count gets internally subtracted by one, following the spec.
*/
static void build_iort_id_mapping(GArray *table_data, uint32_t input_base,
- uint32_t id_count, uint32_t out_ref)
+ uint32_t id_count, uint32_t out_ref,
+ uint32_t flags)
{
build_append_int_noprefix(table_data, input_base, 4); /* Input base */
/* Number of IDs - The number of IDs in the range minus one */
@@ -273,7 +297,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base,
build_append_int_noprefix(table_data, input_base, 4); /* Output base */
build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */
/* Flags */
- build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4);
+ build_append_int_noprefix(table_data, flags, 4);
}
struct AcpiIortIdMapping {
@@ -321,6 +345,7 @@ typedef struct AcpiIortSMMUv3Dev {
GArray *rc_smmu_idmaps;
/* Offset of the SMMUv3 IORT Node relative to the start of the IORT */
size_t offset;
+ bool accel;
} AcpiIortSMMUv3Dev;
/*
@@ -330,7 +355,7 @@ typedef struct AcpiIortSMMUv3Dev {
static int populate_smmuv3_legacy_dev(GArray *sdev_blob)
{
VirtMachineState *vms = VIRT_MACHINE(qdev_get_machine());
- AcpiIortSMMUv3Dev sdev;
+ AcpiIortSMMUv3Dev sdev = {0};
sdev.rc_smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
object_child_foreach_recursive(object_get_root(), iort_host_bridges,
@@ -362,10 +387,10 @@ static int smmuv3_dev_idmap_compare(gconstpointer a, gconstpointer b)
static int iort_smmuv3_devices(Object *obj, void *opaque)
{
VirtMachineState *vms = VIRT_MACHINE(qdev_get_machine());
+ AcpiIortSMMUv3Dev sdev = {0};
GArray *sdev_blob = opaque;
AcpiIortIdMapping idmap;
PlatformBusDevice *pbus;
- AcpiIortSMMUv3Dev sdev;
int min_bus, max_bus;
SysBusDevice *sbdev;
PCIBus *bus;
@@ -375,6 +400,9 @@ static int iort_smmuv3_devices(Object *obj, void *opaque)
}
bus = PCI_BUS(object_property_get_link(obj, "primary-bus", &error_abort));
+ if (object_property_find(obj, "accel")) {
+ sdev.accel = object_property_get_bool(obj, "accel", &error_abort);
+ }
pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev);
sbdev = SYS_BUS_DEVICE(obj);
sdev.base = platform_bus_get_mmio_addr(pbus, sbdev, 0);
@@ -448,10 +476,69 @@ static void create_rc_its_idmaps(GArray *its_idmaps, GArray *smmuv3_devs)
}
}
+static void
+build_iort_rmr_nodes(GArray *table_data, GArray *smmuv3_devices, uint32_t *id)
+{
+ AcpiIortSMMUv3Dev *sdev;
+ AcpiIortIdMapping *idmap;
+ int i;
+
+ for (i = 0; i < smmuv3_devices->len; i++) {
+ uint16_t rmr_len;
+ int bdf;
+
+ sdev = &g_array_index(smmuv3_devices, AcpiIortSMMUv3Dev, i);
+ if (!sdev->accel) {
+ continue;
+ }
+
+ /*
+ * Spec reference:Arm IO Remapping Table(IORT), ARM DEN 0049E.d,
+ * Section 3.1.1.5 "Reserved Memory Range node"
+ */
+ idmap = &g_array_index(sdev->rc_smmu_idmaps, AcpiIortIdMapping, 0);
+ bdf = idmap->input_base;
+ rmr_len = IORT_RMR_COMMON_HEADER_SIZE
+ + (IORT_RMR_NUM_ID_MAPPINGS * ID_MAPPING_ENTRY_SIZE)
+ + (IORT_RMR_NUM_MEM_RANGE_DESC * IORT_RMR_MEM_RANGE_DESC_SIZE);
+
+ /* Table 18 Reserved Memory Range Node */
+ build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */
+ /* Length */
+ build_append_int_noprefix(table_data, rmr_len, 2);
+ build_append_int_noprefix(table_data, 3, 1); /* Revision */
+ build_append_int_noprefix(table_data, (*id)++, 4); /* Identifier */
+ /* Number of ID mappings */
+ build_append_int_noprefix(table_data, IORT_RMR_NUM_ID_MAPPINGS, 4);
+ /* Reference to ID Array */
+ build_append_int_noprefix(table_data, IORT_RMR_COMMON_HEADER_SIZE, 4);
+
+ /* RMR specific data */
+
+ /* Flags */
+ build_append_int_noprefix(table_data, IORT_RMR_FLAGS, 4);
+ /* Number of Memory Range Descriptors */
+ build_append_int_noprefix(table_data, IORT_RMR_NUM_MEM_RANGE_DESC, 4);
+ /* Reference to Memory Range Descriptors */
+ build_append_int_noprefix(table_data, IORT_RMR_COMMON_HEADER_SIZE +
+ (IORT_RMR_NUM_ID_MAPPINGS * ID_MAPPING_ENTRY_SIZE), 4);
+ build_iort_id_mapping(table_data, bdf, idmap->id_count, sdev->offset,
+ 1);
+
+ /* Table 19 Memory Range Descriptor */
+
+ /* Physical Range offset */
+ build_append_int_noprefix(table_data, MSI_IOVA_BASE, 8);
+ /* Physical Range length */
+ build_append_int_noprefix(table_data, MSI_IOVA_LENGTH, 8);
+ build_append_int_noprefix(table_data, 0, 4); /* Reserved */
+ }
+}
+
/*
* Input Output Remapping Table (IORT)
* Conforms to "IO Remapping Table System Software on ARM Platforms",
- * Document number: ARM DEN 0049E.b, Feb 2021
+ * Document number: ARM DEN 0049E.d, Feb 2022
*/
static void
build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
@@ -465,7 +552,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
GArray *smmuv3_devs = g_array_new(false, true, sizeof(AcpiIortSMMUv3Dev));
GArray *rc_its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
- AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id,
+ AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id,
.oem_table_id = vms->oem_table_id };
/* Table 2 The IORT */
acpi_table_begin(&table, table_data);
@@ -491,6 +578,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
nb_nodes++; /* ITS */
rc_mapping_count += rc_its_idmaps->len;
}
+ /* Calculate RMR nodes required. One per SMMUv3 with accelerated mode */
+ for (i = 0; i < num_smmus; i++) {
+ sdev = &g_array_index(smmuv3_devs, AcpiIortSMMUv3Dev, i);
+ if (sdev->accel) {
+ nb_nodes++;
+ }
+ }
} else {
if (vms->its) {
nb_nodes = 2; /* RC and ITS */
@@ -563,7 +657,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
/* Array of ID mappings */
if (smmu_mapping_count) {
/* Output IORT node is the ITS Group node (the first node). */
- build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET);
+ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0);
}
}
@@ -615,7 +709,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
AcpiIortIdMapping, j);
/* Output IORT node is the SMMUv3 node. */
build_iort_id_mapping(table_data, range->input_base,
- range->id_count, sdev->offset);
+ range->id_count, sdev->offset, 0);
}
}
@@ -628,7 +722,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
range = &g_array_index(rc_its_idmaps, AcpiIortIdMapping, i);
/* Output IORT node is the ITS Group node (the first node). */
build_iort_id_mapping(table_data, range->input_base,
- range->id_count, IORT_NODE_OFFSET);
+ range->id_count, IORT_NODE_OFFSET, 0);
}
}
} else {
@@ -637,9 +731,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
* SMMU: RC -> ITS.
* Output IORT node is the ITS Group node (the first node).
*/
- build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET);
+ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0);
}
+ build_iort_rmr_nodes(table_data, smmuv3_devs, &id);
acpi_table_end(linker, &table);
g_array_free(rc_its_idmaps, true);
for (i = 0; i < num_smmus; i++) {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 26/37] tests/qtest/bios-tables-test: Update IORT blobs after revision upgrade
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (24 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 25/37] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 27/37] hw/arm/smmuv3: Block migration when accel is enabled Shameer Kolothum
` (11 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Update the reference IORT blobs after revision upgrade for RMR node
support. This affects the aarch64 'virt' IORT tests.
IORT diff is the same for all the tests:
/*
* Intel ACPI Component Architecture
* AML/ASL+ Disassembler version 20230628 (64-bit version)
* Copyright (c) 2000 - 2023 Intel Corporation
*
- * Disassembly of tests/data/acpi/aarch64/virt/IORT, Mon Oct 20 14:42:41 2025
+ * Disassembly of /tmp/aml-B4ZRE3, Mon Oct 20 14:42:41 2025
*
* ACPI Data Table [IORT]
*
* Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue (in hex)
*/
[000h 0000 004h] Signature : "IORT" [IO Remapping Table]
[004h 0004 004h] Table Length : 00000080
-[008h 0008 001h] Revision : 03
-[009h 0009 001h] Checksum : B3
+[008h 0008 001h] Revision : 05
+[009h 0009 001h] Checksum : B1
[00Ah 0010 006h] Oem ID : "BOCHS "
[010h 0016 008h] Oem Table ID : "BXPC "
[018h 0024 004h] Oem Revision : 00000001
[01Ch 0028 004h] Asl Compiler ID : "BXPC"
[020h 0032 004h] Asl Compiler Revision : 00000001
...
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
tests/data/acpi/aarch64/virt/IORT | Bin 128 -> 128 bytes
tests/data/acpi/aarch64/virt/IORT.its_off | Bin 172 -> 172 bytes
tests/data/acpi/aarch64/virt/IORT.smmuv3-dev | Bin 364 -> 364 bytes
tests/data/acpi/aarch64/virt/IORT.smmuv3-legacy | Bin 276 -> 276 bytes
tests/qtest/bios-tables-test-allowed-diff.h | 4 ----
5 files changed, 4 deletions(-)
diff --git a/tests/data/acpi/aarch64/virt/IORT b/tests/data/acpi/aarch64/virt/IORT
index 7efd0ce8a6b3928efa7e1373f688ab4c5f50543b..a234aae4c2d04668d34313836d32ca20e19c0880 100644
GIT binary patch
delta 18
ZcmZo*Y+&T_^bZPYU|?Wi-8hk}3;-#Q1d;#%
delta 18
ZcmZo*Y+&T_^bZPYU|?Wi-aL`33;-#O1d;#%
diff --git a/tests/data/acpi/aarch64/virt/IORT.its_off b/tests/data/acpi/aarch64/virt/IORT.its_off
index c10da4e61dd00e7eb062558a2735d49ca0b20620..0cf52b52f671637bf4dbc9e0fc80c3c73d0b01d3 100644
GIT binary patch
delta 18
ZcmZ3(xQ3C-(?2L=4FdxM>(q%{ivTdM1ttIh
delta 18
ZcmZ3(xQ3C-(?2L=4FdxM^Yn>aivTdK1ttIh
diff --git a/tests/data/acpi/aarch64/virt/IORT.smmuv3-dev b/tests/data/acpi/aarch64/virt/IORT.smmuv3-dev
index 67be268f62afbf2d9459540984da5e9340afdaaa..43a15fe2bf6cc650ffcbceff86919ea892928c0e 100644
GIT binary patch
delta 19
acmaFE^oEJc(?2LAhmnDS^~6T5Bt`%|fCYU3
delta 19
acmaFE^oEJc(?2LAhmnDS`P4?PBt`%|eg%C1
diff --git a/tests/data/acpi/aarch64/virt/IORT.smmuv3-legacy b/tests/data/acpi/aarch64/virt/IORT.smmuv3-legacy
index 41981a449fc306b80cccd87ddec3c593a8d72c07..5779d0e225a62b9cd70bebbacb7fd1e519c9e3c4 100644
GIT binary patch
delta 19
acmbQjG=+)F(?2Lggpq-P)oUXc7b5^FiUXej
delta 19
acmbQjG=+)F(?2Lggpq-P*=Hjc7b5^Fhy$Mh
diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index 3279638ad0..dfb8523c8b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1,5 +1 @@
/* List of comma-separated changed AML files to ignore */
-"tests/data/acpi/aarch64/virt/IORT",
-"tests/data/acpi/aarch64/virt/IORT.its_off",
-"tests/data/acpi/aarch64/virt/IORT.smmuv3-legacy",
-"tests/data/acpi/aarch64/virt/IORT.smmuv3-dev",
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 27/37] hw/arm/smmuv3: Block migration when accel is enabled
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (25 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 26/37] tests/qtest/bios-tables-test: Update IORT blobs after revision upgrade Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 28/37] hw/arm/smmuv3: Add accel property for SMMUv3 device Shameer Kolothum
` (10 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Live migration is not supported when the SMMUv3 accelerator mode is
enabled. Add a migration blocker to prevent migration in this
configuration.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3.c | 6 ++++++
include/hw/arm/smmuv3.h | 1 +
2 files changed, 7 insertions(+)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index e301bb467d..2be056d792 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -20,6 +20,7 @@
#include "qemu/bitops.h"
#include "hw/core/irq.h"
#include "hw/core/sysbus.h"
+#include "migration/blocker.h"
#include "migration/vmstate.h"
#include "hw/core/qdev-properties.h"
#include "hw/core/qdev.h"
@@ -1925,6 +1926,11 @@ static void smmu_realize(DeviceState *d, Error **errp)
if (s->accel) {
smmuv3_accel_init(s);
+ error_setg(&s->migration_blocker, "Migration not supported with SMMUv3 "
+ "accelerator mode enabled");
+ if (migrate_add_blocker(&s->migration_blocker, errp) < 0) {
+ return;
+ }
}
c->parent_realize(d, &local_err);
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index 5616a8a2be..9c39acd5ca 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -68,6 +68,7 @@ struct SMMUv3State {
bool accel;
struct SMMUv3AccelState *s_accel;
uint64_t msi_gpa;
+ Error *migration_blocker;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 28/37] hw/arm/smmuv3: Add accel property for SMMUv3 device
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (26 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 27/37] hw/arm/smmuv3: Block migration when accel is enabled Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 29/37] hw/arm/smmuv3-accel: Add a property to specify RIL support Shameer Kolothum
` (9 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Add an "accel" property to enable SMMUv3 accelerator mode.
Accelerator mode relies on IORT RMR entries for MSI support and is
therefore not supported when booting with a device tree.
In this mode, the host SMMUv3 operates in nested translation
(Stage-1 + Stage-2), with the guest owning the Stage-1 page tables.
Expose only Stage-1 to the guest to ensure it uses the correct page
table format.
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3.c | 32 ++++++++++++++++++++++++++++++++
hw/arm/virt-acpi-build.c | 4 +---
hw/arm/virt.c | 22 +++++++++++++---------
3 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 2be056d792..8ca1d4ad35 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1916,6 +1916,29 @@ static void smmu_reset_exit(Object *obj, ResetType type)
smmuv3_accel_reset(s);
}
+static bool smmu_validate_property(SMMUv3State *s, Error **errp)
+{
+#ifndef CONFIG_ARM_SMMUV3_ACCEL
+ if (s->accel) {
+ error_setg(errp, "accel=on support not compiled in");
+ return false;
+ }
+#endif
+
+ if (!s->accel) {
+ return true;
+ }
+
+ /* If no stage specified, SMMUv3 defaults to stage 1 */
+ if (s->stage && strcmp(s->stage, "1")) {
+ error_setg(errp,
+ "Only stage1 is supported for SMMUv3 with accel=on");
+ return false;
+ }
+
+ return true;
+}
+
static void smmu_realize(DeviceState *d, Error **errp)
{
SMMUState *sys = ARM_SMMU(d);
@@ -1924,6 +1947,10 @@ static void smmu_realize(DeviceState *d, Error **errp)
SysBusDevice *dev = SYS_BUS_DEVICE(d);
Error *local_err = NULL;
+ if (!smmu_validate_property(s, errp)) {
+ return;
+ }
+
if (s->accel) {
smmuv3_accel_init(s);
error_setg(&s->migration_blocker, "Migration not supported with SMMUv3 "
@@ -2029,6 +2056,7 @@ static const Property smmuv3_properties[] = {
* Defaults to stage 1
*/
DEFINE_PROP_STRING("stage", SMMUv3State, stage),
+ DEFINE_PROP_BOOL("accel", SMMUv3State, accel, false),
/* GPA of MSI doorbell, for SMMUv3 accel use. */
DEFINE_PROP_UINT64("msi-gpa", SMMUv3State, msi_gpa, 0),
};
@@ -2052,6 +2080,10 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
device_class_set_props(dc, smmuv3_properties);
dc->hotpluggable = false;
dc->user_creatable = true;
+
+ object_class_property_set_description(klass, "accel",
+ "Enable SMMUv3 accelerator support. Allows host SMMUv3 to be "
+ "configured in nested mode for vfio-pci dev assignment");
}
static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 4ae4cbc6cd..3126aca42c 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -400,9 +400,7 @@ static int iort_smmuv3_devices(Object *obj, void *opaque)
}
bus = PCI_BUS(object_property_get_link(obj, "primary-bus", &error_abort));
- if (object_property_find(obj, "accel")) {
- sdev.accel = object_property_get_bool(obj, "accel", &error_abort);
- }
+ sdev.accel = object_property_get_bool(obj, "accel", &error_abort);
pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev);
sbdev = SYS_BUS_DEVICE(obj);
sdev.base = platform_bus_get_mmio_addr(pbus, sbdev, 0);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 899b02e1f7..390845c503 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1491,8 +1491,8 @@ static void create_smmuv3_dt_bindings(const VirtMachineState *vms, hwaddr base,
g_free(node);
}
-static void create_smmuv3_dev_dtb(VirtMachineState *vms,
- DeviceState *dev, PCIBus *bus)
+static void create_smmuv3_dev_dtb(VirtMachineState *vms, DeviceState *dev,
+ PCIBus *bus, Error **errp)
{
PlatformBusDevice *pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev);
SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
@@ -1500,10 +1500,15 @@ static void create_smmuv3_dev_dtb(VirtMachineState *vms,
hwaddr base = platform_bus_get_mmio_addr(pbus, sbdev, 0);
MachineState *ms = MACHINE(vms);
- if (!(vms->bootinfo.firmware_loaded && virt_is_acpi_enabled(vms)) &&
- strcmp("pcie.0", bus->qbus.name)) {
- warn_report("SMMUv3 device only supported with pcie.0 for DT");
- return;
+ if (!(vms->bootinfo.firmware_loaded && virt_is_acpi_enabled(vms))) {
+ if (object_property_get_bool(OBJECT(dev), "accel", &error_abort)) {
+ error_setg(errp, "SMMUv3 with accel=on not supported for DT");
+ return;
+ }
+ if (strcmp("pcie.0", bus->qbus.name)) {
+ warn_report("SMMUv3 device only supported with pcie.0 for DT");
+ return;
+ }
}
base += vms->memmap[VIRT_PLATFORM_BUS].base;
irq += vms->irqmap[VIRT_PLATFORM_BUS];
@@ -3061,8 +3066,7 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
object_property_set_link(OBJECT(dev), "secure-memory",
OBJECT(vms->secure_sysmem), NULL);
}
- if (object_property_find(OBJECT(dev), "accel") &&
- object_property_get_bool(OBJECT(dev), "accel", &error_abort)) {
+ if (object_property_get_bool(OBJECT(dev), "accel", &error_abort)) {
hwaddr db_start = 0;
if (!kvm_enabled() || !kvm_irqchip_in_kernel()) {
@@ -3117,7 +3121,7 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
return;
}
- create_smmuv3_dev_dtb(vms, dev, bus);
+ create_smmuv3_dev_dtb(vms, dev, bus, errp);
}
}
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 29/37] hw/arm/smmuv3-accel: Add a property to specify RIL support
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (27 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 28/37] hw/arm/smmuv3: Add accel property for SMMUv3 device Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 30/37] hw/arm/smmuv3-accel: Add support for ATS Shameer Kolothum
` (8 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Currently QEMU SMMUv3 has RIL support by default. But if accelerated mode
is enabled, RIL has to be compatible with host SMMUv3 support.
Add a property so that the user can specify this.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 14 ++++++++++++--
hw/arm/smmuv3-accel.h | 4 ++++
hw/arm/smmuv3.c | 9 +++++++++
include/hw/arm/smmuv3.h | 1 +
4 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 33011962e3..df82f1e32a 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -68,8 +68,8 @@ smmuv3_accel_check_hw_compatible(SMMUv3State *s,
return false;
}
- /* QEMU SMMUv3 supports Range Invalidation by default */
- if (FIELD_EX32(info->idr[3], IDR3, RIL) !=
+ /* User can disable QEMU SMMUv3 Range Invalidation support */
+ if (FIELD_EX32(info->idr[3], IDR3, RIL) <
FIELD_EX32(s->idr[3], IDR3, RIL)) {
error_setg(errp, "Host SMMUv3 doesn't support Range Invalidation");
return false;
@@ -646,6 +646,16 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.get_msi_direct_gpa = smmuv3_accel_get_msi_gpa,
};
+void smmuv3_accel_idr_override(SMMUv3State *s)
+{
+ if (!s->accel) {
+ return;
+ }
+
+ /* By default QEMU SMMUv3 has RIL. Update IDR3 if user has disabled it */
+ s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, s->ril);
+}
+
/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp)
{
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index 41b37e3122..a8a64802ec 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -49,6 +49,7 @@ bool smmuv3_accel_install_ste_range(SMMUv3State *s, SMMUSIDRange *range,
bool smmuv3_accel_attach_gbpa_hwpt(SMMUv3State *s, Error **errp);
bool smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
Error **errp);
+void smmuv3_accel_idr_override(SMMUv3State *s);
void smmuv3_accel_reset(SMMUv3State *s);
#else
static inline void smmuv3_accel_init(SMMUv3State *s)
@@ -76,6 +77,9 @@ smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
{
return true;
}
+static inline void smmuv3_accel_idr_override(SMMUv3State *s)
+{
+}
static inline void smmuv3_accel_reset(SMMUv3State *s)
{
}
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 8ca1d4ad35..cb619f19df 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -305,6 +305,7 @@ static void smmuv3_init_id_regs(SMMUv3State *s)
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
s->aidr = 0x1;
+ smmuv3_accel_idr_override(s);
}
static void smmuv3_reset(SMMUv3State *s)
@@ -1926,6 +1927,10 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
#endif
if (!s->accel) {
+ if (!s->ril) {
+ error_setg(errp, "ril can only be disabled if accel=on");
+ return false;
+ }
return true;
}
@@ -2059,6 +2064,8 @@ static const Property smmuv3_properties[] = {
DEFINE_PROP_BOOL("accel", SMMUv3State, accel, false),
/* GPA of MSI doorbell, for SMMUv3 accel use. */
DEFINE_PROP_UINT64("msi-gpa", SMMUv3State, msi_gpa, 0),
+ /* RIL can be turned off for accel cases */
+ DEFINE_PROP_BOOL("ril", SMMUv3State, ril, true),
};
static void smmuv3_instance_init(Object *obj)
@@ -2084,6 +2091,8 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
object_class_property_set_description(klass, "accel",
"Enable SMMUv3 accelerator support. Allows host SMMUv3 to be "
"configured in nested mode for vfio-pci dev assignment");
+ object_class_property_set_description(klass, "ril",
+ "Disable range invalidation support (for accel=on)");
}
static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index 9c39acd5ca..533a2182e8 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -69,6 +69,7 @@ struct SMMUv3State {
struct SMMUv3AccelState *s_accel;
uint64_t msi_gpa;
Error *migration_blocker;
+ bool ril;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 30/37] hw/arm/smmuv3-accel: Add support for ATS
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (28 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 29/37] hw/arm/smmuv3-accel: Add a property to specify RIL support Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 10:43 ` [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits Shameer Kolothum
` (7 subsequent siblings)
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
QEMU SMMUv3 does not enable ATS (Address Translation Services) by default.
When accelerated mode is enabled and the host SMMUv3 supports ATS, it can
be useful to report ATS capability to the guest so it can take advantage
of it if the device also supports ATS.
Note: ATS support cannot be reliably detected from the host SMMUv3 IDR
registers alone, as firmware ACPI IORT tables may override them. The
user must therefore ensure that the host supports ATS before enabling it.
The ATS support enabled here is only relevant for vfio-pci endpoints,
as SMMUv3 accelerated mode does not support emulated endpoint devices.
QEMU’s SMMUv3 implementation still lacks support for handling ATS
translation requests, which would be required for emulated endpoints.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 3 +++
hw/arm/smmuv3.c | 24 +++++++++++++++++++++++-
hw/arm/virt-acpi-build.c | 10 ++++++++--
include/hw/arm/smmuv3.h | 1 +
4 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index df82f1e32a..a97abc1f79 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -654,6 +654,9 @@ void smmuv3_accel_idr_override(SMMUv3State *s)
/* By default QEMU SMMUv3 has RIL. Update IDR3 if user has disabled it */
s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, s->ril);
+
+ /* QEMU SMMUv3 has no ATS. Advertise ATS if opt-in by property */
+ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, s->ats);
}
/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index cb619f19df..ca086ba00a 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1498,13 +1498,27 @@ static int smmuv3_cmdq_consume(SMMUv3State *s, Error **errp)
*/
smmuv3_range_inval(bs, &cmd, SMMU_STAGE_2);
break;
+ case SMMU_CMD_ATC_INV:
+ {
+ SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd));
+
+ if (!sdev || !s->ats) {
+ trace_smmuv3_unhandled_cmd(type);
+ break;
+ }
+
+ if (!smmuv3_accel_issue_inv_cmd(s, &cmd, sdev, errp)) {
+ cmd_error = SMMU_CERROR_ILL;
+ break;
+ }
+ break;
+ }
case SMMU_CMD_TLBI_EL3_ALL:
case SMMU_CMD_TLBI_EL3_VA:
case SMMU_CMD_TLBI_EL2_ALL:
case SMMU_CMD_TLBI_EL2_ASID:
case SMMU_CMD_TLBI_EL2_VA:
case SMMU_CMD_TLBI_EL2_VAA:
- case SMMU_CMD_ATC_INV:
case SMMU_CMD_PRI_RESP:
case SMMU_CMD_RESUME:
case SMMU_CMD_STALL_TERM:
@@ -1931,6 +1945,10 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
error_setg(errp, "ril can only be disabled if accel=on");
return false;
}
+ if (s->ats) {
+ error_setg(errp, "ats can only be enabled if accel=on");
+ return false;
+ }
return true;
}
@@ -2066,6 +2084,7 @@ static const Property smmuv3_properties[] = {
DEFINE_PROP_UINT64("msi-gpa", SMMUv3State, msi_gpa, 0),
/* RIL can be turned off for accel cases */
DEFINE_PROP_BOOL("ril", SMMUv3State, ril, true),
+ DEFINE_PROP_BOOL("ats", SMMUv3State, ats, false),
};
static void smmuv3_instance_init(Object *obj)
@@ -2093,6 +2112,9 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
"configured in nested mode for vfio-pci dev assignment");
object_class_property_set_description(klass, "ril",
"Disable range invalidation support (for accel=on)");
+ object_class_property_set_description(klass, "ats",
+ "Enable/disable ATS support (for accel=on). Please ensure host "
+ "platform has ATS support before enabling this");
}
static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 3126aca42c..c145678185 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -346,6 +346,7 @@ typedef struct AcpiIortSMMUv3Dev {
/* Offset of the SMMUv3 IORT Node relative to the start of the IORT */
size_t offset;
bool accel;
+ bool ats;
} AcpiIortSMMUv3Dev;
/*
@@ -401,6 +402,7 @@ static int iort_smmuv3_devices(Object *obj, void *opaque)
bus = PCI_BUS(object_property_get_link(obj, "primary-bus", &error_abort));
sdev.accel = object_property_get_bool(obj, "accel", &error_abort);
+ sdev.ats = object_property_get_bool(obj, "ats", &error_abort);
pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev);
sbdev = SYS_BUS_DEVICE(obj);
sdev.base = platform_bus_get_mmio_addr(pbus, sbdev, 0);
@@ -544,6 +546,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
int i, nb_nodes, rc_mapping_count;
AcpiIortSMMUv3Dev *sdev;
size_t node_size;
+ bool ats_needed = false;
int num_smmus = 0;
uint32_t id = 0;
int rc_smmu_idmaps_len = 0;
@@ -579,6 +582,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
/* Calculate RMR nodes required. One per SMMUv3 with accelerated mode */
for (i = 0; i < num_smmus; i++) {
sdev = &g_array_index(smmuv3_devs, AcpiIortSMMUv3Dev, i);
+ if (sdev->ats) {
+ ats_needed = true;
+ }
if (sdev->accel) {
nb_nodes++;
}
@@ -678,8 +684,8 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
build_append_int_noprefix(table_data, 0, 2); /* Reserved */
/* Table 15 Memory Access Flags */
build_append_int_noprefix(table_data, 0x3 /* CCA = CPM = DACS = 1 */, 1);
-
- build_append_int_noprefix(table_data, 0, 4); /* ATS Attribute */
+ /* ATS Attribute */
+ build_append_int_noprefix(table_data, ats_needed, 4);
/* MCFG pci_segment */
build_append_int_noprefix(table_data, 0, 4); /* PCI Segment number */
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index 533a2182e8..242d6429ed 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -70,6 +70,7 @@ struct SMMUv3State {
uint64_t msi_gpa;
Error *migration_blocker;
bool ril;
+ bool ats;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (29 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 30/37] hw/arm/smmuv3-accel: Add support for ATS Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-02-02 14:39 ` Eric Auger
2026-01-26 10:43 ` [PATCH v9 32/37] backends/iommufd: Retrieve PASID width from iommufd_backend_get_device_info() Shameer Kolothum
` (6 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
QEMU SMMUv3 currently sets the output address size (OAS) to 44 bits.
With accelerator mode enabled, a device may use SVA, where CPU page tables
are shared with the SMMU, requiring an OAS at least as large as the
CPU’s output address size. A user option is added to configure this.
However, the OAS value advertised by the virtual SMMU must remain
compatible with the capabilities of the host SMMUv3. In accelerated
mode, the host SMMU performs stage-2 translation and must be able to
consume the intermediate physical addresses (IPA) produced by stage-1.
The OAS exposed by the virtual SMMU defines the maximum IPA width that
stage-1 translations may generate. For AArch64 implementations, the
maximum usable IPA size on the host SMMU is determined by its own OAS.
Check that the configured OAS does not exceed what the host SMMU
can safely support.
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 22 ++++++++++++++++++++++
hw/arm/smmuv3.c | 16 +++++++++++++++-
include/hw/arm/smmuv3-common.h | 5 ++++-
include/hw/arm/smmuv3.h | 1 +
4 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index a97abc1f79..ea420afeb7 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -27,6 +27,14 @@
static MemoryRegion root, sysmem;
static AddressSpace *shared_as_sysmem;
+static int smmuv3_oas_bits(uint32_t oas)
+{
+ static const int map[] = { 32, 36, 40, 42, 44, 48, 52, 56 };
+
+ g_assert(oas < ARRAY_SIZE(map));
+ return map[oas];
+}
+
static bool
smmuv3_accel_check_hw_compatible(SMMUv3State *s,
struct iommu_hw_info_arm_smmuv3 *info,
@@ -74,6 +82,15 @@ smmuv3_accel_check_hw_compatible(SMMUv3State *s,
error_setg(errp, "Host SMMUv3 doesn't support Range Invalidation");
return false;
}
+ /* Check OAS value opted is compatible with Host SMMUv3 IPA */
+ if (FIELD_EX32(info->idr[5], IDR5, OAS) <
+ FIELD_EX32(s->idr[5], IDR5, OAS)) {
+ error_setg(errp, "Host SMMUv3 supports only %d-bit IPA, but the vSMMU "
+ "OAS implies %d-bit IPA",
+ smmuv3_oas_bits(FIELD_EX32(info->idr[5], IDR5, OAS)),
+ smmuv3_oas_bits(FIELD_EX32(s->idr[5], IDR5, OAS)));
+ return false;
+ }
/* QEMU SMMUv3 supports GRAN4K/GRAN16K/GRAN64K translation granules */
if (FIELD_EX32(info->idr[5], IDR5, GRAN4K) !=
@@ -657,6 +674,11 @@ void smmuv3_accel_idr_override(SMMUv3State *s)
/* QEMU SMMUv3 has no ATS. Advertise ATS if opt-in by property */
s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, s->ats);
+
+ /* Advertise 48-bit OAS in IDR5 when requested (default is 44 bits). */
+ if (s->oas == SMMU_OAS_48BIT) {
+ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS_48);
+ }
}
/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index ca086ba00a..cb02184d2d 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -299,7 +299,8 @@ static void smmuv3_init_id_regs(SMMUv3State *s)
s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
- s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
+ /* OAS: 44 bits */
+ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS_44);
/* 4K, 16K and 64K granule support */
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
@@ -1949,6 +1950,10 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
error_setg(errp, "ats can only be enabled if accel=on");
return false;
}
+ if (s->oas != SMMU_OAS_44BIT) {
+ error_setg(errp, "OAS must be 44 bits when accel=off");
+ return false;
+ }
return true;
}
@@ -1959,6 +1964,11 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
return false;
}
+ if (s->oas != SMMU_OAS_44BIT && s->oas != SMMU_OAS_48BIT) {
+ error_setg(errp, "OAS can only be set to 44 or 48 bits");
+ return false;
+ }
+
return true;
}
@@ -2085,6 +2095,7 @@ static const Property smmuv3_properties[] = {
/* RIL can be turned off for accel cases */
DEFINE_PROP_BOOL("ril", SMMUv3State, ril, true),
DEFINE_PROP_BOOL("ats", SMMUv3State, ats, false),
+ DEFINE_PROP_UINT8("oas", SMMUv3State, oas, 44),
};
static void smmuv3_instance_init(Object *obj)
@@ -2115,6 +2126,9 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
object_class_property_set_description(klass, "ats",
"Enable/disable ATS support (for accel=on). Please ensure host "
"platform has ATS support before enabling this");
+ object_class_property_set_description(klass, "oas",
+ "Specify Output Address Size (for accel=on). Supported values "
+ "are 44 or 48 bits. Defaults to 44 bits");
}
static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
index 415b7ccde5..abe3565357 100644
--- a/include/hw/arm/smmuv3-common.h
+++ b/include/hw/arm/smmuv3-common.h
@@ -342,7 +342,10 @@ REG32(IDR5, 0x14)
FIELD(IDR5, VAX, 10, 2);
FIELD(IDR5, STALL_MAX, 16, 16);
-#define SMMU_IDR5_OAS 4
+#define SMMU_OAS_44BIT 44
+#define SMMU_OAS_48BIT 48
+#define SMMU_IDR5_OAS_44 4
+#define SMMU_IDR5_OAS_48 5
REG32(IIDR, 0x18)
REG32(AIDR, 0x1c)
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index 242d6429ed..d488a39cd0 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -71,6 +71,7 @@ struct SMMUv3State {
Error *migration_blocker;
bool ril;
bool ats;
+ uint8_t oas;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-01-26 10:43 ` [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits Shameer Kolothum
@ 2026-02-02 14:39 ` Eric Auger
2026-02-02 15:11 ` Shameer Kolothum Thodi
0 siblings, 1 reply; 68+ messages in thread
From: Eric Auger @ 2026-02-02 14:39 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: peter.maydell, jgg, nicolinc, ddutile, berrange, clg, alex,
nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju
Hi Shameer, Nicolin,
On 1/26/26 11:43 AM, Shameer Kolothum wrote:
> QEMU SMMUv3 currently sets the output address size (OAS) to 44 bits.
> With accelerator mode enabled, a device may use SVA, where CPU page tables
> are shared with the SMMU, requiring an OAS at least as large as the
> CPU’s output address size. A user option is added to configure this.
>
> However, the OAS value advertised by the virtual SMMU must remain
> compatible with the capabilities of the host SMMUv3. In accelerated
> mode, the host SMMU performs stage-2 translation and must be able to
> consume the intermediate physical addresses (IPA) produced by stage-1.
>
> The OAS exposed by the virtual SMMU defines the maximum IPA width that
> stage-1 translations may generate. For AArch64 implementations, the
> maximum usable IPA size on the host SMMU is determined by its own OAS.
> Check that the configured OAS does not exceed what the host SMMU
> can safely support.
After discussion with the Kubevirt guys, the management of the current RIL,
ssidsize, ats and oas options looks touchy because it is tricky for them
to introspect the host values.
In cold plug case at least it looks feasible to use IOMMU_GET_HW_INFO()
to retrieve host info:
RIL is in IDR3
ssidsize in IDR1
OAS in IDR5
ATS may be more touchy but maybe this can be introspected too?
I would advocate to turn those options into _AUTO options to give a
chance to the user to ask for host derived values.
Currently in include/hw/qdev-properties.h we have
DEFINE_PROP_ON_OFF_AUTO for a bool and
DEFINE_PROP_ON_OFF_AUTO_BIT64 for a 64b
RIL can match bool.
Others may need a new DEFINE_PROP_ON_OFF_AUTO_* one.
Note such kind of auto property was introduced for virtio-iommu
(DEFINE_PROP_GRANULE_MODE)
What do you think? Most probably this has been discussed in the past but
I do not necessarily remember the outcome.
Thanks
Eric
>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/arm/smmuv3-accel.c | 22 ++++++++++++++++++++++
> hw/arm/smmuv3.c | 16 +++++++++++++++-
> include/hw/arm/smmuv3-common.h | 5 ++++-
> include/hw/arm/smmuv3.h | 1 +
> 4 files changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index a97abc1f79..ea420afeb7 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -27,6 +27,14 @@
> static MemoryRegion root, sysmem;
> static AddressSpace *shared_as_sysmem;
>
> +static int smmuv3_oas_bits(uint32_t oas)
> +{
> + static const int map[] = { 32, 36, 40, 42, 44, 48, 52, 56 };
> +
> + g_assert(oas < ARRAY_SIZE(map));
> + return map[oas];
> +}
> +
> static bool
> smmuv3_accel_check_hw_compatible(SMMUv3State *s,
> struct iommu_hw_info_arm_smmuv3 *info,
> @@ -74,6 +82,15 @@ smmuv3_accel_check_hw_compatible(SMMUv3State *s,
> error_setg(errp, "Host SMMUv3 doesn't support Range Invalidation");
> return false;
> }
> + /* Check OAS value opted is compatible with Host SMMUv3 IPA */
> + if (FIELD_EX32(info->idr[5], IDR5, OAS) <
> + FIELD_EX32(s->idr[5], IDR5, OAS)) {
> + error_setg(errp, "Host SMMUv3 supports only %d-bit IPA, but the vSMMU "
> + "OAS implies %d-bit IPA",
> + smmuv3_oas_bits(FIELD_EX32(info->idr[5], IDR5, OAS)),
> + smmuv3_oas_bits(FIELD_EX32(s->idr[5], IDR5, OAS)));
> + return false;
> + }
>
> /* QEMU SMMUv3 supports GRAN4K/GRAN16K/GRAN64K translation granules */
> if (FIELD_EX32(info->idr[5], IDR5, GRAN4K) !=
> @@ -657,6 +674,11 @@ void smmuv3_accel_idr_override(SMMUv3State *s)
>
> /* QEMU SMMUv3 has no ATS. Advertise ATS if opt-in by property */
> s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, s->ats);
> +
> + /* Advertise 48-bit OAS in IDR5 when requested (default is 44 bits). */
> + if (s->oas == SMMU_OAS_48BIT) {
> + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS_48);
> + }
> }
>
> /* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
> diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> index ca086ba00a..cb02184d2d 100644
> --- a/hw/arm/smmuv3.c
> +++ b/hw/arm/smmuv3.c
> @@ -299,7 +299,8 @@ static void smmuv3_init_id_regs(SMMUv3State *s)
> s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
> s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
>
> - s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
> + /* OAS: 44 bits */
> + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS_44);
> /* 4K, 16K and 64K granule support */
> s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
> s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
> @@ -1949,6 +1950,10 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
> error_setg(errp, "ats can only be enabled if accel=on");
> return false;
> }
> + if (s->oas != SMMU_OAS_44BIT) {
> + error_setg(errp, "OAS must be 44 bits when accel=off");
> + return false;
> + }
> return true;
> }
>
> @@ -1959,6 +1964,11 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
> return false;
> }
>
> + if (s->oas != SMMU_OAS_44BIT && s->oas != SMMU_OAS_48BIT) {
> + error_setg(errp, "OAS can only be set to 44 or 48 bits");
> + return false;
> + }
> +
> return true;
> }
>
> @@ -2085,6 +2095,7 @@ static const Property smmuv3_properties[] = {
> /* RIL can be turned off for accel cases */
> DEFINE_PROP_BOOL("ril", SMMUv3State, ril, true),
> DEFINE_PROP_BOOL("ats", SMMUv3State, ats, false),
> + DEFINE_PROP_UINT8("oas", SMMUv3State, oas, 44),
> };
>
> static void smmuv3_instance_init(Object *obj)
> @@ -2115,6 +2126,9 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
> object_class_property_set_description(klass, "ats",
> "Enable/disable ATS support (for accel=on). Please ensure host "
> "platform has ATS support before enabling this");
> + object_class_property_set_description(klass, "oas",
> + "Specify Output Address Size (for accel=on). Supported values "
> + "are 44 or 48 bits. Defaults to 44 bits");
> }
>
> static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
> diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
> index 415b7ccde5..abe3565357 100644
> --- a/include/hw/arm/smmuv3-common.h
> +++ b/include/hw/arm/smmuv3-common.h
> @@ -342,7 +342,10 @@ REG32(IDR5, 0x14)
> FIELD(IDR5, VAX, 10, 2);
> FIELD(IDR5, STALL_MAX, 16, 16);
>
> -#define SMMU_IDR5_OAS 4
> +#define SMMU_OAS_44BIT 44
> +#define SMMU_OAS_48BIT 48
> +#define SMMU_IDR5_OAS_44 4
> +#define SMMU_IDR5_OAS_48 5
>
> REG32(IIDR, 0x18)
> REG32(AIDR, 0x1c)
> diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
> index 242d6429ed..d488a39cd0 100644
> --- a/include/hw/arm/smmuv3.h
> +++ b/include/hw/arm/smmuv3.h
> @@ -71,6 +71,7 @@ struct SMMUv3State {
> Error *migration_blocker;
> bool ril;
> bool ats;
> + uint8_t oas;
> };
>
> typedef enum {
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 14:39 ` Eric Auger
@ 2026-02-02 15:11 ` Shameer Kolothum Thodi
2026-02-02 15:19 ` Jason Gunthorpe
2026-02-02 15:29 ` Eric Auger
0 siblings, 2 replies; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-02-02 15:11 UTC (permalink / raw)
To: eric.auger@redhat.com, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: peter.maydell@linaro.org, Jason Gunthorpe, Nicolin Chen,
ddutile@redhat.com, berrange@redhat.com, clg@redhat.com,
alex@shazbot.org, Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Eric,
> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: 02 February 2026 14:40
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; Jason Gunthorpe <jgg@nvidia.com>; Nicolin
> Chen <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> specify OAS bits
>
> External email: Use caution opening links or attachments
>
>
> Hi Shameer, Nicolin,
> On 1/26/26 11:43 AM, Shameer Kolothum wrote:
> > QEMU SMMUv3 currently sets the output address size (OAS) to 44 bits.
> > With accelerator mode enabled, a device may use SVA, where CPU page
> tables
> > are shared with the SMMU, requiring an OAS at least as large as the
> > CPU’s output address size. A user option is added to configure this.
> >
> > However, the OAS value advertised by the virtual SMMU must remain
> > compatible with the capabilities of the host SMMUv3. In accelerated
> > mode, the host SMMU performs stage-2 translation and must be able to
> > consume the intermediate physical addresses (IPA) produced by stage-1.
> >
> > The OAS exposed by the virtual SMMU defines the maximum IPA width that
> > stage-1 translations may generate. For AArch64 implementations, the
> > maximum usable IPA size on the host SMMU is determined by its own OAS.
> > Check that the configured OAS does not exceed what the host SMMU
> > can safely support.
>
> After discussion with Kubevirt guys, the management of current RIL,
> ssidsize, ats and oas options look touchy because it is tricky for them
> to introspect the host values.
I may be wrong, but it looks like Kubevirt actually makes use of libvirt. Or
is it independent?
The reason I am asking is Nathan is already working on the libvirt changes
for this here:
https://lists.libvirt.org/archives/list/devel@lists.libvirt.org/thread/5GG76AQTTDUHW5KRANPY3QUII4ZOEYRP/
And the _AUTO suggestion might impact that as well.
> In cold plug case at least it looks feasible to use IOMMU_GET_HW_INFO()
> to retrieve host info:
>
> RIL is in IDR3
> ssidsize in IDR1
> OAS in IDR5
> ATS may be more touchy but maybe this can be introspected too?
Yeah. ATS might require some kernel plumbing as BIOS can override it.
> I would advocate to turn those options into _AUTO options to give a
> chance to the user to ask for host derived values.
>
> Currently in include/hw/qdev-properties.h we have
> DEFINE_PROP_ON_OFF_AUTO for a bool and
> DEFINE_PROP_ON_OFF_AUTO_BIT64 for a 64b
>
> RIL can match bool.
> Others may need a new DEFINE_PROP_ON_OFF_AUTO_* one.
>
> Note such kind of auto property was introduced for virtio-iommu
> (DEFINE_PROP_GRANULE_MODE)
>
> What do you think? Most probably this has been discussed in the past but
> I do not necessarily remember the outputs.
IIRC, the only conclusion was that the user has to specify the SMMUv3
parameters. Don't think the _AUTO option was discussed previously.
If this is a very useful thing to have, I can take a look.
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 15:11 ` Shameer Kolothum Thodi
@ 2026-02-02 15:19 ` Jason Gunthorpe
2026-02-02 15:38 ` Shameer Kolothum Thodi
2026-02-02 15:29 ` Eric Auger
1 sibling, 1 reply; 68+ messages in thread
From: Jason Gunthorpe @ 2026-02-02 15:19 UTC (permalink / raw)
To: Shameer Kolothum Thodi
Cc: eric.auger@redhat.com, qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
On Mon, Feb 02, 2026 at 03:11:28PM +0000, Shameer Kolothum Thodi wrote:
> > RIL is in IDR3
> > ssidsize in IDR1
> > OAS in IDR5
> > ATS may be more touchy but maybe this can be introspected too?
>
> Yeah. ATS might require some kernel plumbing as BIOS can override it.
We can treat ATS as a per-PCIe device property.. I think it would be
fine to tell the SMMU that it always has ATS support, it will never do
anything with it unless it sees a PCIe device with an ATS cap, and the
physical STE generated by the hypervisor should sanitize the EATS.
BIOS overriding it should be reflected as the devices being reported
as not supporting ATS, qemu should have a per-device flag to disable
ATS.
Not sure that helps libvirt side though..
Jason
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 15:19 ` Jason Gunthorpe
@ 2026-02-02 15:38 ` Shameer Kolothum Thodi
2026-02-02 16:00 ` Jason Gunthorpe
0 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-02-02 15:38 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: eric.auger@redhat.com, qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
> -----Original Message-----
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: 02 February 2026 15:19
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> specify OAS bits
>
> On Mon, Feb 02, 2026 at 03:11:28PM +0000, Shameer Kolothum Thodi
> wrote:
> > > RIL is in IDR3
> > > ssidsize in IDR1
> > > OAS in IDR5
> > > ATS may be more touchy but maybe this can be introspected too?
> >
> > Yeah. ATS might require some kernel plumbing as BIOS can override it.
>
> We can treat ATS as a per-PCIe device property.. I think it would be
> fine to tell the SMMU that it always has ATS support, it will never do
> anything with it unless it sees a PCIe device with an ATS cap, and the
> physical STE generated by the hypervisor should sanitize the EATS.
>
> BIOS overriding it should be reflected as the devices being reported
> as not supporting ATS, qemu should have a per-device flag to disable
> ATS.
Do we have a way to detect that (IOMMU_FWSPEC_PCI_RC_ATS) from
userspace now?
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 15:38 ` Shameer Kolothum Thodi
@ 2026-02-02 16:00 ` Jason Gunthorpe
2026-02-02 16:03 ` Shameer Kolothum Thodi
0 siblings, 1 reply; 68+ messages in thread
From: Jason Gunthorpe @ 2026-02-02 16:00 UTC (permalink / raw)
To: Shameer Kolothum Thodi
Cc: eric.auger@redhat.com, qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
On Mon, Feb 02, 2026 at 03:38:50PM +0000, Shameer Kolothum Thodi wrote:
> > We can treat ATS as a per-PCIe device property.. I think it would be
> > fine to tell the SMMU that it always has ATS support, it will never do
> > anything with it unless it sees a PCIe device with an ATS cap, and the
> > physical STE generated by the hypervisor should sanitize the EATS.
> >
> > BIOS overriding it should be reflected as the devices being reported
> > as not supporting ATS, qemu should have a per-device flag to disable
> > ATS.
>
> Do we have way to detect that(IOMMU_FWSPEC_PCI_RC_ATS) from
> userspace now?
I don't think so.. I was describing how I suspect that should work.
The iommu driver is the only entity that decides if ATS should be
enabled per-device, so it should report back to userspace in iommufd
if the device is allowed to enable ATS or not. That should roll up any
FW overrides and the PCI cap block.
Jason
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 16:00 ` Jason Gunthorpe
@ 2026-02-02 16:03 ` Shameer Kolothum Thodi
2026-02-10 15:12 ` Shameer Kolothum Thodi
0 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-02-02 16:03 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: eric.auger@redhat.com, qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
> -----Original Message-----
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: 02 February 2026 16:00
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> specify OAS bits
>
> On Mon, Feb 02, 2026 at 03:38:50PM +0000, Shameer Kolothum Thodi
> wrote:
>
> > > We can treat ATS as a per-PCIe device property.. I think it would be
> > > fine to tell the SMMU that it always has ATS support, it will never do
> > > anything with it unless it sees a PCIe device with an ATS cap, and the
> > > physical STE generated by the hypervisor should sanitize the EATS.
> > >
> > > BIOS overriding it should be reflected as the devices being reported
> > > as not supporting ATS, qemu should have a per-device flag to disable
> > > ATS.
> >
> > Do we have way to detect that(IOMMU_FWSPEC_PCI_RC_ATS) from
> > userspace now?
>
> I don't think so.. I was describing how I suspect that should work.
>
> The iommu driver is the only entity that decides if ATS should be
> enabled per-device, so it should report back to userspace in iommufd
> if the device is allowed to enable ATS or not. That should roll up any
> FW overrides and the PCI cap block.
Right. Maybe just like the out_max_pasid_log2. Will take a look.
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 16:03 ` Shameer Kolothum Thodi
@ 2026-02-10 15:12 ` Shameer Kolothum Thodi
2026-02-10 16:01 ` Eric Auger
2026-03-04 7:47 ` Eric Auger
0 siblings, 2 replies; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-02-10 15:12 UTC (permalink / raw)
To: Jason Gunthorpe, eric.auger@redhat.com
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Eric,
> -----Original Message-----
> From: Shameer Kolothum Thodi
> Sent: 02 February 2026 16:04
> To: Jason Gunthorpe <jgg@nvidia.com>
> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> specify OAS bits
>
>
>
> > -----Original Message-----
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: 02 February 2026 16:00
> > To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
> > Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> > devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
> > <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> > clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
> > Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
> > wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
> > jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
> > zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
> > <kjaju@nvidia.com>
> > Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> > specify OAS bits
> >
> > On Mon, Feb 02, 2026 at 03:38:50PM +0000, Shameer Kolothum Thodi
> > wrote:
> >
> > > > We can treat ATS as a per-PCIe device property.. I think it would
> > > > be fine to tell the SMMU that it always has ATS support, it will
> > > > never do anything with it unless it sees a PCIe device with an ATS
> > > > cap, and the physical STE generated by the hypervisor should sanitize the
> EATS.
> > > >
> > > > BIOS overriding it should be reflected as the devices being
> > > > reported as not supporting ATS, qemu should have a per-device flag
> > > > to disable ATS.
> > >
> > > Do we have way to detect that(IOMMU_FWSPEC_PCI_RC_ATS) from
> > > userspace now?
> >
> > I don't think so.. I was describing how I suspect that should work.
> >
> > The iommu driver is the only entity that decides if ATS should be
> > enabled per-device, so it should report back to userspace in iommufd
> > if the device is allowed to enable ATS or not. That should roll up any
> > FW overrides and the PCI cap block.
>
> Right. May be just like the out_max_pasid_log2. Will take a look.
Looking at adding the ATS detection support through iommufd, one
idea is to extend the enum iommufd_hw_capabilities with
IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED. The question I have
now is whether we need to differentiate an old kernel which doesn't
have this support or not.
This is how the usage would look with an added "ats" property on the
vfio-pci device:
-device vfio-pci,...,ats=on
if ATS_NOT_SUPPORTED then reject
-device vfio-pci,...,ats=off
keep ATS off
-device vfio-pci,...,ats=auto (default)
if ATS_NOT_SUPPORTED, disable ATS
otherwise, enable ATS if the PCIe ATS capability is present
(this also covers behaviour on older kernels)
Does the above make sense?
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-10 15:12 ` Shameer Kolothum Thodi
@ 2026-02-10 16:01 ` Eric Auger
2026-02-10 16:08 ` Shameer Kolothum Thodi
2026-03-04 7:47 ` Eric Auger
1 sibling, 1 reply; 68+ messages in thread
From: Eric Auger @ 2026-02-10 16:01 UTC (permalink / raw)
To: Shameer Kolothum Thodi, Jason Gunthorpe
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Shameer,
On 2/10/26 4:12 PM, Shameer Kolothum Thodi wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Shameer Kolothum Thodi
>> Sent: 02 February 2026 16:04
>> To: Jason Gunthorpe <jgg@nvidia.com>
>> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
>> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
>> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
>> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
>> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
>> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
>> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
>> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
>> <kjaju@nvidia.com>
>> Subject: RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
>> specify OAS bits
>>
>>
>>
>>> -----Original Message-----
>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>> Sent: 02 February 2026 16:00
>>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>>> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
>>> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
>>> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
>>> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
>>> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
>>> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
>>> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
>>> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
>>> <kjaju@nvidia.com>
>>> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
>>> specify OAS bits
>>>
>>> On Mon, Feb 02, 2026 at 03:38:50PM +0000, Shameer Kolothum Thodi
>>> wrote:
>>>
>>>>> We can treat ATS as a per-PCIe device property.. I think it would
>>>>> be fine to tell the SMMU that it always has ATS support, it will
>>>>> never do anything with it unless it sees a PCIe device with an ATS
>>>>> cap, and the physical STE generated by the hypervisor should sanitize the
>> EATS.
>>>>> BIOS overriding it should be reflected as the devices being
>>>>> reported as not supporting ATS, qemu should have a per-device flag
>>>>> to disable ATS.
>>>> Do we have way to detect that(IOMMU_FWSPEC_PCI_RC_ATS) from
>>>> userspace now?
>>> I don't think so.. I was describing how I suspect that should work.
>>>
>>> The iommu driver is the only entity that decides if ATS should be
>>> enabled per-device, so it should report back to userspace in iommufd
>>> if the device is allowed to enable ATS or not. That should roll up any
>>> FW overrides and the PCI cap block.
>> Right. May be just like the out_max_pasid_log2. Will take a look.
> Looking at adding the ATS detection support through iommufd, one
> idea is to extend the enum iommufd_hw_capabilities with
> IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED. The question I have
> now is whether we need to differentiate an old kernel which doesn't
> have this support or not.
>
> This is how the usage would look with an added "ats" property on the
> vfio-pci device:
>
> -device vfio-pci,...,ats=on
> if ATS_NOT_SUPPORTED then reject
How do you distinguish an old kernel which is not capable of returning
the cap info from a new kernel that effectively reports that the capability
is not supported?
> -device vfio-pci,...,ats=off
> keep ATS off
> -device vfio-pci,...,ats=auto (default)
> if ATS_NOT_SUPPORTED, disable ATS
> otherwise, enable ATS if the PCIe ATS capability is present
I am not sure how it works with an old kernel. auto or on would keep
ATS=off while it is currently set today, no?
Thanks
Eric
> (this also covers behaviour on older kernels)
>
> Does the above make sense?
>
> Thanks,
> Shameer
>
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-10 16:01 ` Eric Auger
@ 2026-02-10 16:08 ` Shameer Kolothum Thodi
0 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-02-10 16:08 UTC (permalink / raw)
To: eric.auger@redhat.com, Jason Gunthorpe
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: 10 February 2026 16:01
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>
> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> peter.maydell@linaro.org; Nicolin Chen <nicolinc@nvidia.com>;
> ddutile@redhat.com; berrange@redhat.com; clg@redhat.com;
> alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> Krishnakant Jaju <kjaju@nvidia.com>
> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
> specify OAS bits
>
> External email: Use caution opening links or attachments
>
>
> Hi Shameer,
> On 2/10/26 4:12 PM, Shameer Kolothum Thodi wrote:
> > This is how the usage would look with an added "ats" property on the
> > vfio-pci device:
> >
> > -device vfio-pci,...,ats=on
> > if ATS_NOT_SUPPORTED then reject
> How do you distinguish an old kernel which is not capable of returning
> the cap info from a new kernel that effectively returns the capability
> is not supported.
For an old kernel, ATS_NOT_SUPPORTED will not be set, so we keep ATS if the
CAP is present in this case. But we reject if a new kernel reports
ATS_NOT_SUPPORTED.
> > -device vfio-pci,...,ats=off
> > keep ATS off
> > -device vfio-pci,...,ats=auto (default)
> > if ATS_NOT_SUPPORTED, disable ATS
> > otherwise, enable ATS if the PCIe ATS capability is present
>
> I am not sure how it works with an old kernel. auto or on would keep
> ATS=off while it is currently set today, no?
Same as above, for old kernel ATS_NOT_SUPPORTED is not set. So
just rely on PCIe CAP presence just like it does today.
Thanks,
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-10 15:12 ` Shameer Kolothum Thodi
2026-02-10 16:01 ` Eric Auger
@ 2026-03-04 7:47 ` Eric Auger
2026-03-04 8:26 ` Shameer Kolothum Thodi
1 sibling, 1 reply; 68+ messages in thread
From: Eric Auger @ 2026-03-04 7:47 UTC (permalink / raw)
To: Shameer Kolothum Thodi, Jason Gunthorpe, Pavel Hrdina
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Shameer,
On 2/10/26 4:12 PM, Shameer Kolothum Thodi wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Shameer Kolothum Thodi
>> Sent: 02 February 2026 16:04
>> To: Jason Gunthorpe <jgg@nvidia.com>
>> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
>> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
>> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
>> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
>> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
>> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
>> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
>> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
>> <kjaju@nvidia.com>
>> Subject: RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
>> specify OAS bits
>>
>>
>>
>>> -----Original Message-----
>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>> Sent: 02 February 2026 16:00
>>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>>> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
>>> devel@nongnu.org; peter.maydell@linaro.org; Nicolin Chen
>>> <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
>>> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
>>> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
>>> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
>>> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
>>> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
>>> <kjaju@nvidia.com>
>>> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
>>> specify OAS bits
>>>
>>> On Mon, Feb 02, 2026 at 03:38:50PM +0000, Shameer Kolothum Thodi
>>> wrote:
>>>
>>>>> We can treat ATS as a per-PCIe device property.. I think it would
>>>>> be fine to tell the SMMU that it always has ATS support, it will
>>>>> never do anything with it unless it sees a PCIe device with an ATS
>>>>> cap, and the physical STE generated by the hypervisor should sanitize the
>> EATS.
>>>>> BIOS overriding it should be reflected as the devices being
>>>>> reported as not supporting ATS, qemu should have a per-device flag
>>>>> to disable ATS.
>>>> Do we have way to detect that(IOMMU_FWSPEC_PCI_RC_ATS) from
>>>> userspace now?
>>> I don't think so.. I was describing how I suspect that should work.
>>>
>>> The iommu driver is the only entity that decides if ATS should be
>>> enabled per-device, so it should report back to userspace in iommufd
>>> if the device is allowed to enable ATS or not. That should roll up any
>>> FW overrides and the PCI cap block.
>> Right. May be just like the out_max_pasid_log2. Will take a look.
> Looking at adding the ATS detection support through iommufd, one
> idea is to extend the enum iommufd_hw_capabilities with
> IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED. The question I have
> now is whether we need to differentiate an old kernel which doesn't
> have this support or not.
>
> This is how the usage would look with an added "ats" property on the
> vfio-pci device:
>
> -device vfio-pci,...,ats=on
> if ATS_NOT_SUPPORTED then reject
> -device vfio-pci,...,ats=off
> keep ATS off
> -device vfio-pci,...,ats=auto (default)
> if ATS_NOT_SUPPORTED, disable ATS
> otherwise, enable ATS if the PCIe ATS capability is present
> (this also covers behaviour on older kernels)
>
> Does the above make sense?
Do you have plans wrt the migration to auto properties for the various
vSMMU properties (RIL, ssidsize, OAS, ATS)? I just would like to make
sure libvirt integrators take this new design into account.
Please let me know if some help is needed for the conversion of some of
the properties.
Thanks
Eric
>
> Thanks,
> Shameer
>
^ permalink raw reply [flat|nested] 68+ messages in thread
* RE: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-03-04 7:47 ` Eric Auger
@ 2026-03-04 8:26 ` Shameer Kolothum Thodi
2026-03-04 16:37 ` Eric Auger
0 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum Thodi @ 2026-03-04 8:26 UTC (permalink / raw)
To: eric.auger@redhat.com, Jason Gunthorpe, Pavel Hrdina
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Eric,
> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: 04 March 2026 07:47
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; Pavel Hrdina <phrdina@redhat.com>
> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> peter.maydell@linaro.org; Nicolin Chen <nicolinc@nvidia.com>;
> ddutile@redhat.com; berrange@redhat.com; clg@redhat.com;
> alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> Krishnakant Jaju <kjaju@nvidia.com>
> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify
> OAS bits
>
[...]
> >
> > This is how the usage would look with an added "ats" property on the
> > vfio-pci device:
> >
> > -device vfio-pci,...,ats=on
> > if ATS_NOT_SUPPORTED then reject
> > -device vfio-pci,...,ats=off
> > keep ATS off
> > -device vfio-pci,...,ats=auto (default)
> > if ATS_NOT_SUPPORTED, disable ATS
> > otherwise, enable ATS if the PCIe ATS capability is present
> > (this also covers behaviour on older kernels)
> >
> > Does the above make sense?
> Do you have plans wrt the migration to auto properties for the various
> vSMMU properties (RIL, ssidsize, OAS, ATS)? I just would like to make
> sure libvirt integrators take this new design into account.
> Please let me know if some help is needed for the conversion of some of
> the properties?
Yes, Nathan is working on a QEMU series to convert those to auto
properties. I have also sent out a kernel series to report the effective
ATS status for the device. Sorry, I missed CCing you on that thread.
Please find it here [0]. The QEMU series needs to make use of this
as well.
Hopefully, Nathan will be able to send out the QEMU series soon after
internal review and testing.
Thanks,
Shameer
[0] https://lore.kernel.org/linux-iommu/20260303150348.233997-1-skolothumtho@nvidia.com/
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-03-04 8:26 ` Shameer Kolothum Thodi
@ 2026-03-04 16:37 ` Eric Auger
0 siblings, 0 replies; 68+ messages in thread
From: Eric Auger @ 2026-03-04 16:37 UTC (permalink / raw)
To: Shameer Kolothum Thodi, Jason Gunthorpe, Pavel Hrdina
Cc: qemu-arm@nongnu.org, qemu-devel@nongnu.org,
peter.maydell@linaro.org, Nicolin Chen, ddutile@redhat.com,
berrange@redhat.com, clg@redhat.com, alex@shazbot.org,
Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
Hi Shameer,
On 3/4/26 9:26 AM, Shameer Kolothum Thodi wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: 04 March 2026 07:47
>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; Pavel Hrdina <phrdina@redhat.com>
>> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
>> peter.maydell@linaro.org; Nicolin Chen <nicolinc@nvidia.com>;
>> ddutile@redhat.com; berrange@redhat.com; clg@redhat.com;
>> alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
>> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
>> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
>> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
>> Krishnakant Jaju <kjaju@nvidia.com>
>> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify
>> OAS bits
>>
> [...]
>>> This is how the usage would look with an added "ats" property on the
>>> vfio-pci device:
>>>
>>> -device vfio-pci,...,ats=on
>>> if ATS_NOT_SUPPORTED then reject
>>> -device vfio-pci,...,ats=off
>>> keep ATS off
>>> -device vfio-pci,...,ats=auto (default)
>>> if ATS_NOT_SUPPORTED, disable ATS
>>> otherwise, enable ATS if the PCIe ATS capability is present
>>> (this also covers behaviour on older kernels)
>>>
>>> Does the above make sense?
>> Do you have plans wrt the migration to auto properties for the various
>> vSMMU properties (RIL, ssidsize, OAS, ATS)? I just would like to make
>> sure libvirt integrators take this new design into account.
>> Please let me know if some help is needed for the conversion of some of
>> the properties?
> Yes, Nathan is working on a QEMU series to convert those to auto
> properties. I have also sent out a kernel series to report the effective
> ATS status for the device. Sorry, I missed CCing you on that thread.
> Please find it here [0]. The QEMU series needs to make use of this
> as well.
>
> Hopefully, Nathan will be able to send out the QEMU series soon after
> internal review and testing.
>
> Thanks,
> Shameer
> [0] https://lore.kernel.org/linux-iommu/20260303150348.233997-1-skolothumtho@nvidia.com/
>
Thank you for the update & pointers!
Eric
>
^ permalink raw reply [flat|nested] 68+ messages in thread
* Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits
2026-02-02 15:11 ` Shameer Kolothum Thodi
2026-02-02 15:19 ` Jason Gunthorpe
@ 2026-02-02 15:29 ` Eric Auger
1 sibling, 0 replies; 68+ messages in thread
From: Eric Auger @ 2026-02-02 15:29 UTC (permalink / raw)
To: Shameer Kolothum Thodi, qemu-arm@nongnu.org,
qemu-devel@nongnu.org
Cc: peter.maydell@linaro.org, Jason Gunthorpe, Nicolin Chen,
ddutile@redhat.com, berrange@redhat.com, clg@redhat.com,
alex@shazbot.org, Nathan Chen, Matt Ochs, smostafa@google.com,
wangzhou1@hisilicon.com, jiangkunkun@huawei.com,
jonathan.cameron@huawei.com, zhangfei.gao@linaro.org,
zhenzhong.duan@intel.com, yi.l.liu@intel.com, Krishnakant Jaju
On 2/2/26 4:11 PM, Shameer Kolothum Thodi wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: 02 February 2026 14:40
>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>> arm@nongnu.org; qemu-devel@nongnu.org
>> Cc: peter.maydell@linaro.org; Jason Gunthorpe <jgg@nvidia.com>; Nicolin
>> Chen <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
>> clg@redhat.com; alex@shazbot.org; Nathan Chen <nathanc@nvidia.com>;
>> Matt Ochs <mochs@nvidia.com>; smostafa@google.com;
>> wangzhou1@hisilicon.com; jiangkunkun@huawei.com;
>> jonathan.cameron@huawei.com; zhangfei.gao@linaro.org;
>> zhenzhong.duan@intel.com; yi.l.liu@intel.com; Krishnakant Jaju
>> <kjaju@nvidia.com>
>> Subject: Re: [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to
>> specify OAS bits
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Hi Shameer, Nicolin,
>> On 1/26/26 11:43 AM, Shameer Kolothum wrote:
>>> QEMU SMMUv3 currently sets the output address size (OAS) to 44 bits.
>>> With accelerator mode enabled, a device may use SVA, where CPU page
>> tables
>>> are shared with the SMMU, requiring an OAS at least as large as the
>>> CPU’s output address size. A user option is added to configure this.
>>>
>>> However, the OAS value advertised by the virtual SMMU must remain
>>> compatible with the capabilities of the host SMMUv3. In accelerated
>>> mode, the host SMMU performs stage-2 translation and must be able to
>>> consume the intermediate physical addresses (IPA) produced by stage-1.
>>>
>>> The OAS exposed by the virtual SMMU defines the maximum IPA width that
>>> stage-1 translations may generate. For AArch64 implementations, the
>>> maximum usable IPA size on the host SMMU is determined by its own OAS.
>>> Check that the configured OAS does not exceed what the host SMMU
>>> can safely support.
>> After discussion with Kubevirt guys, the management of current RIL,
>> ssidsize, ats and oas options look touchy because it is tricky for them
>> to introspect the host values.
> I may be wrong, but it looks like Kubevirt actually makes use of libvirt. Or
> is it independent?
yes they do? The problem is they don't know how to set the values for
those props besides defining a kind of hardcoded profile. They are
not used to doing that. The best would be to let qemu guess the right values
by introspection through the kernel uapi.
>
> The reason I am asking is Nathan is already working on the libvirt changes
> for this here:
> https://lists.libvirt.org/archives/list/devel@lists.libvirt.org/thread/5GG76AQTTDUHW5KRANPY3QUII4ZOEYRP/
>
> And the _AUTO suggestion might impact that as well.
sure. That's why I am discussing it right now.
>
>> In cold plug case at least it looks feasible to use IOMMU_GET_HW_INFO()
>> to retrieve host info:
>>
>> RIL is in IDR3
>> ssidsize in IDR1
>> OAS in IDR5
>> ATS may be more touchy but maybe this can be introspected too?
> Yeah. ATS might require some kernel plumbing as BIOS can override it.
agreed
>
>> I would advocate to turn those options into _AUTO options to give a
>> chance to the user to ask for host derived values.
>>
>> Currently in include/hw/qdev-properties.h we have
>> DEFINE_PROP_ON_OFF_AUTO for a bool and
>> DEFINE_PROP_ON_OFF_AUTO_BIT64 for a 64b
>>
>> RIL can match bool.
>> Others may need a new DEFINE_PROP_ON_OFF_AUTO_* one.
>>
>> Note such kind of auto property was introduced for virtio-iommu
>> (DEFINE_PROP_GRANULE_MODE)
>>
>> What do you think? Most probably this has been discussed in the past but
>> I do not necessarily remember the outputs.
> IIRC, the only conclusion was that the user has to specify the SMMUv3
> parameters. Don't think the _AUTO option was discussed previously.
> If this is a very useful thing to have, I can take a look.
If this sounds feasible, I think this would be a more relevant approach.
The default values could be kept as is for now and we would let kubevirt
set AUTO value until the default gets changed.
Eric
>
> Thanks,
> Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 32/37] backends/iommufd: Retrieve PASID width from iommufd_backend_get_device_info()
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (30 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 31/37] hw/arm/smmuv3-accel: Add property to specify OAS bits Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 33/37] backends/iommufd: Add get_pasid_info() callback Shameer Kolothum
` (5 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Retrieve PASID width from iommufd_backend_get_device_info() and store it
in HostIOMMUDeviceCaps for later use.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
backends/iommufd.c | 6 +++++-
hw/arm/smmuv3-accel.c | 3 ++-
hw/vfio/iommufd.c | 6 ++++--
include/system/host_iommu_device.h | 3 +++
include/system/iommufd.h | 3 ++-
5 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index e3a3c1480e..9b63d74083 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -389,7 +389,8 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be,
bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
uint32_t *type, void *data, uint32_t len,
- uint64_t *caps, Error **errp)
+ uint64_t *caps, uint8_t *max_pasid_log2,
+ Error **errp)
{
struct iommu_hw_info info = {
.size = sizeof(info),
@@ -408,6 +409,9 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
g_assert(caps);
*caps = info.out_capabilities;
+ if (max_pasid_log2) {
+ *max_pasid_log2 = info.out_max_pasid_log2;
+ }
return true;
}
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index ea420afeb7..342944da23 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -121,7 +121,8 @@ smmuv3_accel_hw_compatible(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
uint64_t caps;
if (!iommufd_backend_get_device_info(idev->iommufd, idev->devid, &data_type,
- &info, sizeof(info), &caps, errp)) {
+ &info, sizeof(info), &caps, NULL,
+ errp)) {
return false;
}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 2947e1b80f..131612eb83 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -398,7 +398,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
*/
if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
&type, &caps, sizeof(caps), &hw_caps,
- errp)) {
+ NULL, errp)) {
return false;
}
@@ -939,19 +939,21 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
HostIOMMUDeviceCaps *caps = &hiod->caps;
VendorCaps *vendor_caps = &caps->vendor_caps;
enum iommu_hw_info_type type;
+ uint8_t max_pasid_log2;
uint64_t hw_caps;
hiod->agent = opaque;
if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type,
vendor_caps, sizeof(*vendor_caps),
- &hw_caps, errp)) {
+ &hw_caps, &max_pasid_log2, errp)) {
return false;
}
hiod->name = g_strdup(vdev->name);
caps->type = type;
caps->hw_caps = hw_caps;
+ caps->max_pasid_log2 = max_pasid_log2;
idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
idev->iommufd = vdev->iommufd;
diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h
index ab849a4a82..bfb2b60478 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -30,6 +30,8 @@ typedef union VendorCaps {
* @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents
* the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl)
*
+ * @max_pasid_log2: width of PASIDs supported by host IOMMU device
+ *
* @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on
* IOMMUFD this represents a user-space buffer filled by kernel
* with host IOMMU @type specific hardware information data)
@@ -37,6 +39,7 @@ typedef union VendorCaps {
typedef struct HostIOMMUDeviceCaps {
uint32_t type;
uint64_t hw_caps;
+ uint8_t max_pasid_log2;
VendorCaps vendor_caps;
} HostIOMMUDeviceCaps;
#endif
diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index 567dfb7b1d..80d72469a9 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -71,7 +71,8 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, uint64_t size);
bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
uint32_t *type, void *data, uint32_t len,
- uint64_t *caps, Error **errp);
+ uint64_t *caps, uint8_t *max_pasid_log2,
+ Error **errp);
bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
uint32_t pt_id, uint32_t flags,
uint32_t data_type, uint32_t data_len,
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread
* Re: [PATCH v9 32/37] backends/iommufd: Retrieve PASID width from iommufd_backend_get_device_info()
2026-01-26 10:43 ` [PATCH v9 32/37] backends/iommufd: Retrieve PASID width from iommufd_backend_get_device_info() Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Retrieve PASID width from iommufd_backend_get_device_info() and store it
> in HostIOMMUDeviceCaps for later use.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Reviewed-by: Cédric Le Goater <clg@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> backends/iommufd.c | 6 +++++-
> hw/arm/smmuv3-accel.c | 3 ++-
> hw/vfio/iommufd.c | 6 ++++--
> include/system/host_iommu_device.h | 3 +++
> include/system/iommufd.h | 3 ++-
> 5 files changed, 16 insertions(+), 5 deletions(-)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index e3a3c1480e..9b63d74083 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -389,7 +389,8 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be,
>
> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
> uint32_t *type, void *data, uint32_t len,
> - uint64_t *caps, Error **errp)
> + uint64_t *caps, uint8_t *max_pasid_log2,
> + Error **errp)
> {
> struct iommu_hw_info info = {
> .size = sizeof(info),
> @@ -408,6 +409,9 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
> g_assert(caps);
> *caps = info.out_capabilities;
>
> + if (max_pasid_log2) {
> + *max_pasid_log2 = info.out_max_pasid_log2;
> + }
> return true;
> }
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index ea420afeb7..342944da23 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -121,7 +121,8 @@ smmuv3_accel_hw_compatible(SMMUv3State *s, HostIOMMUDeviceIOMMUFD *idev,
> uint64_t caps;
>
> if (!iommufd_backend_get_device_info(idev->iommufd, idev->devid, &data_type,
> - &info, sizeof(info), &caps, errp)) {
> + &info, sizeof(info), &caps, NULL,
> + errp)) {
> return false;
> }
>
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 2947e1b80f..131612eb83 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -398,7 +398,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> */
> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
> &type, &caps, sizeof(caps), &hw_caps,
> - errp)) {
> + NULL, errp)) {
> return false;
> }
>
> @@ -939,19 +939,21 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
> HostIOMMUDeviceCaps *caps = &hiod->caps;
> VendorCaps *vendor_caps = &caps->vendor_caps;
> enum iommu_hw_info_type type;
> + uint8_t max_pasid_log2;
> uint64_t hw_caps;
>
> hiod->agent = opaque;
>
> if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type,
> vendor_caps, sizeof(*vendor_caps),
> - &hw_caps, errp)) {
> + &hw_caps, &max_pasid_log2, errp)) {
> return false;
> }
>
> hiod->name = g_strdup(vdev->name);
> caps->type = type;
> caps->hw_caps = hw_caps;
> + caps->max_pasid_log2 = max_pasid_log2;
>
> idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
> idev->iommufd = vdev->iommufd;
> diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h
> index ab849a4a82..bfb2b60478 100644
> --- a/include/system/host_iommu_device.h
> +++ b/include/system/host_iommu_device.h
> @@ -30,6 +30,8 @@ typedef union VendorCaps {
> * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents
> * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl)
> *
> + * @max_pasid_log2: width of PASIDs supported by host IOMMU device
> + *
> * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on
> * IOMMUFD this represents a user-space buffer filled by kernel
> * with host IOMMU @type specific hardware information data)
> @@ -37,6 +39,7 @@ typedef union VendorCaps {
> typedef struct HostIOMMUDeviceCaps {
> uint32_t type;
> uint64_t hw_caps;
> + uint8_t max_pasid_log2;
> VendorCaps vendor_caps;
> } HostIOMMUDeviceCaps;
> #endif
> diff --git a/include/system/iommufd.h b/include/system/iommufd.h
> index 567dfb7b1d..80d72469a9 100644
> --- a/include/system/iommufd.h
> +++ b/include/system/iommufd.h
> @@ -71,7 +71,8 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> hwaddr iova, uint64_t size);
> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
> uint32_t *type, void *data, uint32_t len,
> - uint64_t *caps, Error **errp);
> + uint64_t *caps, uint8_t *max_pasid_log2,
> + Error **errp);
> bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
> uint32_t pt_id, uint32_t flags,
> uint32_t data_type, uint32_t data_len,
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 33/37] backends/iommufd: Add get_pasid_info() callback
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (31 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 32/37] backends/iommufd: Retrieve PASID width from iommufd_backend_get_device_info() Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:06 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset Shameer Kolothum
` (4 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
The get_pasid_info callback retrieves PASID capability information
when the HostIOMMUDevice backend supports it. Currently, only the
Linux IOMMUFD backend provides this information.
This will be used by a subsequent patch to synthesize a PASID
capability for vfio-pci devices behind a vIOMMU that supports PASID.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
backends/iommufd.c | 17 +++++++++++++++++
include/system/host_iommu_device.h | 17 +++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 9b63d74083..13822df82f 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -539,11 +539,28 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
}
}
+static bool hiod_iommufd_get_pasid_info(HostIOMMUDevice *hiod,
+ PasidInfo *pasid_info)
+{
+ HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+ if (!caps->max_pasid_log2) {
+ return false;
+ }
+
+ g_assert(pasid_info);
+ pasid_info->exec_perm = (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC);
+ pasid_info->priv_mod = (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV);
+ pasid_info->max_pasid_log2 = caps->max_pasid_log2;
+ return true;
+}
+
static void hiod_iommufd_class_init(ObjectClass *oc, const void *data)
{
HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
hioc->get_cap = hiod_iommufd_get_cap;
+ hioc->get_pasid_info = hiod_iommufd_get_pasid_info;
};
static const TypeInfo types[] = {
diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h
index bfb2b60478..f000301583 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -59,6 +59,12 @@ struct HostIOMMUDevice {
#endif
};
+typedef struct PasidInfo {
+ bool exec_perm;
+ bool priv_mod;
+ uint8_t max_pasid_log2;
+} PasidInfo;
+
/**
* struct HostIOMMUDeviceClass - The base class for all host IOMMU devices.
*
@@ -116,6 +122,17 @@ struct HostIOMMUDeviceClass {
* @hiod: handle to the host IOMMU device
*/
uint64_t (*get_page_size_mask)(HostIOMMUDevice *hiod);
+ /**
+ * @get_pasid_info: Return the PASID information associated with the
+ * @hiod Host IOMMU device.
+ *
+ * @hiod: handle to the host IOMMU device
+ *
+ * @pasid_info: If success, returns the PASID related information.
+ *
+ * Returns: true on success, false on failure.
+ */
+ bool (*get_pasid_info)(HostIOMMUDevice *hiod, PasidInfo *pasid_info);
};
/*
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 33/37] backends/iommufd: Add get_pasid_info() callback
2026-01-26 10:43 ` [PATCH v9 33/37] backends/iommufd: Add get_pasid_info() callback Shameer Kolothum
@ 2026-01-26 12:06 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:06 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:43, Shameer Kolothum wrote:
> The get_pasid_info callback retrieves PASID capability information
> when the HostIOMMUDevice backend supports it. Currently, only the
> Linux IOMMUFD backend provides this information.
>
> This will be used by a subsequent patch to synthesize a PASID
> capability for vfio-pci devices behind a vIOMMU that supports PASID.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> backends/iommufd.c | 17 +++++++++++++++++
> include/system/host_iommu_device.h | 17 +++++++++++++++++
> 2 files changed, 34 insertions(+)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 9b63d74083..13822df82f 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -539,11 +539,28 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
> }
> }
>
> +static bool hiod_iommufd_get_pasid_info(HostIOMMUDevice *hiod,
> + PasidInfo *pasid_info)
> +{
> + HostIOMMUDeviceCaps *caps = &hiod->caps;
> +
> + if (!caps->max_pasid_log2) {
> + return false;
> + }
> +
> + g_assert(pasid_info);
> + pasid_info->exec_perm = (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC);
> + pasid_info->priv_mod = (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV);
> + pasid_info->max_pasid_log2 = caps->max_pasid_log2;
> + return true;
> +}
> +
> static void hiod_iommufd_class_init(ObjectClass *oc, const void *data)
> {
> HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
>
> hioc->get_cap = hiod_iommufd_get_cap;
> + hioc->get_pasid_info = hiod_iommufd_get_pasid_info;
> };
>
> static const TypeInfo types[] = {
> diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h
> index bfb2b60478..f000301583 100644
> --- a/include/system/host_iommu_device.h
> +++ b/include/system/host_iommu_device.h
> @@ -59,6 +59,12 @@ struct HostIOMMUDevice {
> #endif
> };
>
> +typedef struct PasidInfo {
> + bool exec_perm;
> + bool priv_mod;
> + uint8_t max_pasid_log2;
> +} PasidInfo;
> +
> /**
> * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices.
> *
> @@ -116,6 +122,17 @@ struct HostIOMMUDeviceClass {
> * @hiod: handle to the host IOMMU device
> */
> uint64_t (*get_page_size_mask)(HostIOMMUDevice *hiod);
> + /**
> + * @get_pasid_info: Return the PASID information associated with the
> + * @hiod Host IOMMU device.
> + *
> + * @hiod: handle to the host IOMMU device
> + *
> + * @pasid_info: If success, returns the PASID related information.
> + *
> + * Returns: true on success, false on failure.
> + */
> + bool (*get_pasid_info)(HostIOMMUDevice *hiod, PasidInfo *pasid_info);
> };
>
> /*
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (32 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 33/37] backends/iommufd: Add get_pasid_info() callback Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:07 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 35/37] hw/pci: Factor out common PASID capability initialization Shameer Kolothum
` (3 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
Add pcie_insert_capability(), a helper to insert a PCIe extended
capability into an existing extended capability list at a caller
specified offset.
Unlike pcie_add_capability(), which always appends a capability to the
end of the list, this helper preserves the existing list ordering while
allowing insertion at an arbitrary offset.
The helper only validates that the insertion does not overwrite an
existing PCIe extended capability header, since corrupting a header
would break the extended capability linked list. Validation of overlaps
with other configuration space registers or capability-specific
register blocks is left to the caller.
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pcie.c | 69 +++++++++++++++++++++++++++++++++++++++++++
include/hw/pci/pcie.h | 2 ++
2 files changed, 71 insertions(+)
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index b302de6419..aa9024e532 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -1050,6 +1050,75 @@ static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
pci_set_long(dev->config + pos, header);
}
+/*
+ * Insert a PCIe extended capability at a given offset.
+ *
+ * This helper only validates that the insertion does not overwrite an
+ * existing PCIe extended capability header, as corrupting a header would
+ * break the extended capability linked list.
+ *
+ * The caller must ensure that (offset, size) does not overlap with other
+ * registers or capability-specific register blocks. Overlaps with
+ * capability-specific registers are not checked and are considered a
+ * user-controlled override.
+ *
+ * Note: Best effort helper. The PCIe spec does not require extended
+ * capabilities to be ordered, but most devices use a forward-linked list.
+ * Devices that do not consistently use a forward-linked list may cause
+ * insertion to fail.
+ */
+bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver,
+ uint16_t offset, uint16_t size)
+{
+ uint16_t pos = PCI_CONFIG_SPACE_SIZE, prev = 0;
+ uint32_t header;
+
+ assert(pci_is_express(dev));
+
+ if (!QEMU_IS_ALIGNED(offset, PCI_EXT_CAP_ALIGN) ||
+ size < 8 ||
+ offset < PCI_CONFIG_SPACE_SIZE ||
+ offset >= PCIE_CONFIG_SPACE_SIZE ||
+ offset + size > PCIE_CONFIG_SPACE_SIZE) {
+ return false;
+ }
+
+ header = pci_get_long(dev->config + pos);
+ if (!header) {
+ /* No extended capability present, insertion must be at the ECAP head */
+ if (offset != pos) {
+ return false;
+ }
+ pci_set_long(dev->config + pos, PCI_EXT_CAP(cap_id, cap_ver, 0));
+ goto out;
+ }
+
+ while (header && pos && offset >= pos) {
+ uint16_t next = PCI_EXT_CAP_NEXT(header);
+
+ /* Reject insertion inside an existing ECAP header (4 bytes) */
+ if (offset < pos + PCI_EXT_CAP_ALIGN) {
+ return false;
+ }
+
+ prev = pos;
+ pos = next;
+ header = pos ? pci_get_long(dev->config + pos) : 0;
+ }
+
+ pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, pos));
+ if (prev) {
+ pcie_ext_cap_set_next(dev, prev, offset);
+ }
+
+out:
+ /* Make capability read-only by default */
+ memset(dev->wmask + offset, 0, size);
+ memset(dev->w1cmask + offset, 0, size);
+ /* Check capability by default */
+ memset(dev->cmask + offset, 0xFF, size);
+ return true;
+}
/*
* Caller must supply valid (offset, size) such that the range wouldn't
* overlap with other capability or other registers.
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index c880ae1e04..d68bfa6257 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -133,6 +133,8 @@ uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
void pcie_add_capability(PCIDevice *dev,
uint16_t cap_id, uint8_t cap_ver,
uint16_t offset, uint16_t size);
+bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver,
+ uint16_t offset, uint16_t size);
void pcie_sync_bridge_lnk(PCIDevice *dev);
void pcie_acs_init(PCIDevice *dev, uint16_t offset);
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset
2026-01-26 10:43 ` [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset Shameer Kolothum
@ 2026-01-26 12:07 ` Yi Liu
2026-01-26 14:17 ` Shameer Kolothum
0 siblings, 1 reply; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:07 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju,
Michael S . Tsirkin
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Add pcie_insert_capability(), a helper to insert a PCIe extended
> capability into an existing extended capability list at a caller
> specified offset.
>
> Unlike pcie_add_capability(), which always appends a capability to the
> end of the list, this helper preserves the existing list ordering while
> allowing insertion at an arbitrary offset.
>
> The helper only validates that the insertion does not overwrite an
> existing PCIe extended capability header, since corrupting a header
> would break the extended capability linked list. Validation of overlaps
> with other configuration space registers or capability-specific
> register blocks is left to the caller.
>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci/pcie.c | 69 +++++++++++++++++++++++++++++++++++++++++++
> include/hw/pci/pcie.h | 2 ++
> 2 files changed, 71 insertions(+)
>
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index b302de6419..aa9024e532 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -1050,6 +1050,75 @@ static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
> pci_set_long(dev->config + pos, header);
> }
>
> +/*
> + * Insert a PCIe extended capability at a given offset.
> + *
> + * This helper only validates that the insertion does not overwrite an
> + * existing PCIe extended capability header, as corrupting a header would
> + * break the extended capability linked list.
> + *
> + * The caller must ensure that (offset, size) does not overlap with other
> + * registers or capability-specific register blocks. Overlaps with
> + * capability-specific registers are not checked and are considered a
> + * user-controlled override.
> + *
> + * Note: Best effort helper. The PCIe spec does not require extended
> + * capabilities to be ordered, but most devices use a forward-linked list.
> + * Devices that do not consistently use a forward-linked list may cause
> + * insertion to fail.
> + */
> +bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver,
> + uint16_t offset, uint16_t size)
> +{
> + uint16_t pos = PCI_CONFIG_SPACE_SIZE, prev = 0;
> + uint32_t header;
> +
> + assert(pci_is_express(dev));
> +
> + if (!QEMU_IS_ALIGNED(offset, PCI_EXT_CAP_ALIGN) ||
> + size < 8 ||
> + offset < PCI_CONFIG_SPACE_SIZE ||
> + offset >= PCIE_CONFIG_SPACE_SIZE ||
> + offset + size > PCIE_CONFIG_SPACE_SIZE) {
> + return false;
> + }
> +
> + header = pci_get_long(dev->config + pos);
> + if (!header) {
> + /* No extended capability present, insertion must be at the ECAP head */
> + if (offset != pos) {
> + return false;
> + }
> + pci_set_long(dev->config + pos, PCI_EXT_CAP(cap_id, cap_ver, 0));
> + goto out;
> + }
> +
> + while (header && pos && offset >= pos) {
> + uint16_t next = PCI_EXT_CAP_NEXT(header);
> +
> + /* Reject insertion inside an existing ECAP header (4 bytes) */
> + if (offset < pos + PCI_EXT_CAP_ALIGN) {
> + return false;
> + }
TBH, I was expecting to see a table similar to
pci_ext_cap_length[][1], and a helper that walks the ext cap list
to figure out a spare position. But that might be over-engineering, as
we rely on the user to give an offset that is sure not to conflict
with existing ecaps or hidden registers. Could you also add a comment
in the code to note this?
[1]
https://github.com/torvalds/linux/blob/63804fed149a6750ffd28610c5c1c98cce6bd377/drivers/vfio/pci/vfio_pci_config.c#L71
With the above nit, LGTM.
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> + prev = pos;
> + pos = next;
> + header = pos ? pci_get_long(dev->config + pos) : 0;
> + }
> +
> + pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, pos));
> + if (prev) {
> + pcie_ext_cap_set_next(dev, prev, offset);
> + }
> +
> +out:
> + /* Make capability read-only by default */
> + memset(dev->wmask + offset, 0, size);
> + memset(dev->w1cmask + offset, 0, size);
> + /* Check capability by default */
> + memset(dev->cmask + offset, 0xFF, size);
> + return true;
> +}
> /*
> * Caller must supply valid (offset, size) such that the range wouldn't
> * overlap with other capability or other registers.
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index c880ae1e04..d68bfa6257 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -133,6 +133,8 @@ uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
> void pcie_add_capability(PCIDevice *dev,
> uint16_t cap_id, uint8_t cap_ver,
> uint16_t offset, uint16_t size);
> +bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver,
> + uint16_t offset, uint16_t size);
> void pcie_sync_bridge_lnk(PCIDevice *dev);
>
> void pcie_acs_init(PCIDevice *dev, uint16_t offset);
^ permalink raw reply [flat|nested] 68+ messages in thread* RE: [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset
2026-01-26 12:07 ` Yi Liu
@ 2026-01-26 14:17 ` Shameer Kolothum
2026-01-27 3:03 ` Yi Liu
0 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 14:17 UTC (permalink / raw)
To: Yi Liu, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, Jason Gunthorpe,
Nicolin Chen, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, Nathan Chen, Matt Ochs,
smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
Krishnakant Jaju, Michael S . Tsirkin
Hi Yi,
> -----Original Message-----
> From: Yi Liu <yi.l.liu@intel.com>
> Sent: 26 January 2026 12:08
> To: Shameer Kolothum <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; clg@redhat.com; alex@shazbot.org; Nathan Chen
> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; Krishnakant Jaju
> <kjaju@nvidia.com>; Michael S . Tsirkin <mst@redhat.com>
> Subject: Re: [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended
> capability at a fixed offset
>
> External email: Use caution opening links or attachments
>
>
> On 2026/1/26 18:43, Shameer Kolothum wrote:
> > Add pcie_insert_capability(), a helper to insert a PCIe extended
> > capability into an existing extended capability list at a caller
> > specified offset.
> >
> > Unlike pcie_add_capability(), which always appends a capability to the
> > end of the list, this helper preserves the existing list ordering while
> > allowing insertion at an arbitrary offset.
> >
> > The helper only validates that the insertion does not overwrite an
> > existing PCIe extended capability header, since corrupting a header
> > would break the extended capability linked list. Validation of overlaps
> > with other configuration space registers or capability-specific
> > register blocks is left to the caller.
> >
> > Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> > Reviewed-by: Eric Auger <eric.auger@redhat.com>
> > Tested-by: Eric Auger <eric.auger@redhat.com>
> > Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> > Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > ---
> > hw/pci/pcie.c | 69
> +++++++++++++++++++++++++++++++++++++++++++
> > include/hw/pci/pcie.h | 2 ++
> > 2 files changed, 71 insertions(+)
> >
> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index b302de6419..aa9024e532 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -1050,6 +1050,75 @@ static void pcie_ext_cap_set_next(PCIDevice
> *dev, uint16_t pos, uint16_t next)
> > pci_set_long(dev->config + pos, header);
> > }
> >
> > +/*
> > + * Insert a PCIe extended capability at a given offset.
> > + *
> > + * This helper only validates that the insertion does not overwrite an
> > + * existing PCIe extended capability header, as corrupting a header would
> > + * break the extended capability linked list.
> > + *
> > + * The caller must ensure that (offset, size) does not overlap with other
> > + * registers or capability-specific register blocks. Overlaps with
> > + * capability-specific registers are not checked and are considered a
> > + * user-controlled override.
> > + *
> > + * Note: Best effort helper. The PCIe spec does not require extended
> > + * capabilities to be ordered, but most devices use a forward-linked list.
> > + * Devices that do not consistently use a forward-linked list may cause
> > + * insertion to fail.
> > + */
> > +bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t
> cap_ver,
> > + uint16_t offset, uint16_t size)
> > +{
> > + uint16_t pos = PCI_CONFIG_SPACE_SIZE, prev = 0;
> > + uint32_t header;
> > +
> > + assert(pci_is_express(dev));
> > +
> > + if (!QEMU_IS_ALIGNED(offset, PCI_EXT_CAP_ALIGN) ||
> > + size < 8 ||
> > + offset < PCI_CONFIG_SPACE_SIZE ||
> > + offset >= PCIE_CONFIG_SPACE_SIZE ||
> > + offset + size > PCIE_CONFIG_SPACE_SIZE) {
> > + return false;
> > + }
> > +
> > + header = pci_get_long(dev->config + pos);
> > + if (!header) {
> > + /* No extended capability present, insertion must be at the ECAP head
> */
> > + if (offset != pos) {
> > + return false;
> > + }
> > + pci_set_long(dev->config + pos, PCI_EXT_CAP(cap_id, cap_ver, 0));
> > + goto out;
> > + }
> > +
> > + while (header && pos && offset >= pos) {
> > + uint16_t next = PCI_EXT_CAP_NEXT(header);
> > +
> > + /* Reject insertion inside an existing ECAP header (4 bytes) */
> > + if (offset < pos + PCI_EXT_CAP_ALIGN) {
> > + return false;
> > + }
>
> TBH, I was expecting to see a table similar to
> pci_ext_cap_length[][1], and a helper that walks the ext cap list
> to figure out a spare position. But that might be over-engineering, as
> we rely on the user to give an offset that is sure not to conflict
> with existing ecaps or hidden registers. Could you also add a comment
> in the code to note this?
>
> [1]
> https://github.com/torvalds/linux/blob/63804fed149a6750ffd28610c5c1c9
> 8cce6bd377/drivers/vfio/pci/vfio_pci_config.c#L71
>
> With the above nit, LGTM.
>
> Reviewed-by: Yi Liu <yi.l.liu@intel.com>
Thanks.
Yes, the helper depends on the caller to provide a suitable offset, since
it's not generally possible to validate this due to hidden registers and
device specific capability layouts, and this is already noted in the commit
log and function comment.
If you had a specific wording in mind that would make this clearer, please
let me know and I can incorporate it if this requires a respin.
Shameer
^ permalink raw reply [flat|nested] 68+ messages in thread* Re: [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset
2026-01-26 14:17 ` Shameer Kolothum
@ 2026-01-27 3:03 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-27 3:03 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm@nongnu.org, qemu-devel@nongnu.org
Cc: eric.auger@redhat.com, peter.maydell@linaro.org, Jason Gunthorpe,
Nicolin Chen, ddutile@redhat.com, berrange@redhat.com,
clg@redhat.com, alex@shazbot.org, Nathan Chen, Matt Ochs,
smostafa@google.com, wangzhou1@hisilicon.com,
jiangkunkun@huawei.com, jonathan.cameron@huawei.com,
zhangfei.gao@linaro.org, zhenzhong.duan@intel.com,
Krishnakant Jaju, Michael S . Tsirkin
On 2026/1/26 22:17, Shameer Kolothum wrote:
> Hi Yi,
>
>>> +/*
>>> + * Insert a PCIe extended capability at a given offset.
>>> + *
>>> + * This helper only validates that the insertion does not overwrite an
>>> + * existing PCIe extended capability header, as corrupting a header would
>>> + * break the extended capability linked list.
>>> + *
>>> + * The caller must ensure that (offset, size) does not overlap with other
>>> + * registers or capability-specific register blocks. Overlaps with
>>> + * capability-specific registers are not checked and are considered a
>>> + * user-controlled override.
>>> + *
>>> + * Note: Best effort helper. The PCIe spec does not require extended
>>> + * capabilities to be ordered, but most devices use a forward-linked list.
>>> + * Devices that do not consistently use a forward-linked list may cause
>>> + * insertion to fail.
>>> + */
>>> +bool pcie_insert_capability(PCIDevice *dev, uint16_t cap_id, uint8_t
>> cap_ver,
>>> + uint16_t offset, uint16_t size)
>>> +{
>>> + uint16_t pos = PCI_CONFIG_SPACE_SIZE, prev = 0;
>>> + uint32_t header;
>>> +
>>> + assert(pci_is_express(dev));
>>> +
>>> + if (!QEMU_IS_ALIGNED(offset, PCI_EXT_CAP_ALIGN) ||
>>> + size < 8 ||
>>> + offset < PCI_CONFIG_SPACE_SIZE ||
>>> + offset >= PCIE_CONFIG_SPACE_SIZE ||
>>> + offset + size > PCIE_CONFIG_SPACE_SIZE) {
>>> + return false;
>>> + }
>>> +
>>> + header = pci_get_long(dev->config + pos);
>>> + if (!header) {
>>> + /* No extended capability present, insertion must be at the ECAP head
>> */
>>> + if (offset != pos) {
>>> + return false;
>>> + }
>>> + pci_set_long(dev->config + pos, PCI_EXT_CAP(cap_id, cap_ver, 0));
>>> + goto out;
>>> + }
>>> +
>>> + while (header && pos && offset >= pos) {
>>> + uint16_t next = PCI_EXT_CAP_NEXT(header);
>>> +
>>> + /* Reject insertion inside an existing ECAP header (4 bytes) */
>>> + if (offset < pos + PCI_EXT_CAP_ALIGN) {
>>> + return false;
>>> + }
>>
>> TBH, I was expecting to see a table similar to
>> pci_ext_cap_length[][1], and a helper that walks the ext cap list
>> to figure out a spare position. But that might be over-engineering, as
>> we rely on the user to give an offset that is sure not to conflict
>> with existing ecaps or hidden registers. Could you also add a comment
>> in the code to note this?
>>
>> [1]
>> https://github.com/torvalds/linux/blob/63804fed149a6750ffd28610c5c1c9
>> 8cce6bd377/drivers/vfio/pci/vfio_pci_config.c#L71
>>
>> With the above nit, LGTM.
>>
>> Reviewed-by: Yi Liu <yi.l.liu@intel.com>
>
> Thanks.
>
> Yes, the helper depends on the caller to provide a suitable offset, since
> it's not generally possible to validate this due to hidden registers and
> device specific capability layouts, and this is already noted in the commit
> log and function comment.
>
> If you had a specific wording in mind that would make this clearer, please
> let me know and I can incorporate it if this requires a respin.
No respin needed. I just noticed that the comment in the function header is
enough. :)
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 35/37] hw/pci: Factor out common PASID capability initialization
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (33 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 34/37] hw/pci: Add helper to insert PCIe extended capability at a fixed offset Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:07 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 36/37] hw/vfio/pci: Synthesize PASID capability for vfio-pci devices Shameer Kolothum
` (2 subsequent siblings)
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho, Michael S . Tsirkin
Refactor PCIe PASID capability initialization by moving the common
register init into a new helper, pcie_pasid_common_init().
A subsequent patch that synthesizes a vPASID will make use of this
helper.
No functional change intended.
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pcie.c | 19 ++++++++++++-------
include/hw/pci/pcie.h | 2 ++
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index aa9024e532..c481c16c0f 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -1284,18 +1284,13 @@ void pcie_acs_reset(PCIDevice *dev)
}
}
-/* PASID */
-void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
- bool exec_perm, bool priv_mod)
+void pcie_pasid_common_init(PCIDevice *dev, uint16_t offset,
+ uint8_t pasid_width, bool exec_perm, bool priv_mod)
{
static const uint16_t control_reg_rw_mask = 0x07;
uint16_t capability_reg;
assert(pasid_width <= PCI_EXT_CAP_PASID_MAX_WIDTH);
-
- pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER, offset,
- PCI_EXT_CAP_PASID_SIZEOF);
-
capability_reg = ((uint16_t)pasid_width) << PCI_PASID_CAP_WIDTH_SHIFT;
capability_reg |= exec_perm ? PCI_PASID_CAP_EXEC : 0;
capability_reg |= priv_mod ? PCI_PASID_CAP_PRIV : 0;
@@ -1307,6 +1302,16 @@ void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
pci_set_word(dev->wmask + offset + PCI_PASID_CTRL, control_reg_rw_mask);
dev->exp.pasid_cap = offset;
+
+}
+
+/* PASID */
+void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
+ bool exec_perm, bool priv_mod)
+{
+ pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER, offset,
+ PCI_EXT_CAP_PASID_SIZEOF);
+ pcie_pasid_common_init(dev, offset, pasid_width, exec_perm, priv_mod);
}
/* PRI */
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index d68bfa6257..fc02aeb169 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -155,6 +155,8 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp);
+void pcie_pasid_common_init(PCIDevice *dev, uint16_t offset,
+ uint8_t pasid_width, bool exec_perm, bool priv_mod);
void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
bool exec_perm, bool priv_mod);
void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap,
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 35/37] hw/pci: Factor out common PASID capability initialization
2026-01-26 10:43 ` [PATCH v9 35/37] hw/pci: Factor out common PASID capability initialization Shameer Kolothum
@ 2026-01-26 12:07 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:07 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju,
Michael S . Tsirkin
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Refactor PCIe PASID capability initialization by moving the common
> register init into a new helper, pcie_pasid_common_init().
>
> A subsequent patch that synthesizes a vPASID capability will make use
> of this helper.
>
> No functional change intended.
>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci/pcie.c | 19 ++++++++++++-------
> include/hw/pci/pcie.h | 2 ++
> 2 files changed, 14 insertions(+), 7 deletions(-)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index aa9024e532..c481c16c0f 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -1284,18 +1284,13 @@ void pcie_acs_reset(PCIDevice *dev)
> }
> }
>
> -/* PASID */
> -void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
> - bool exec_perm, bool priv_mod)
> +void pcie_pasid_common_init(PCIDevice *dev, uint16_t offset,
> + uint8_t pasid_width, bool exec_perm, bool priv_mod)
> {
> static const uint16_t control_reg_rw_mask = 0x07;
> uint16_t capability_reg;
>
> assert(pasid_width <= PCI_EXT_CAP_PASID_MAX_WIDTH);
> -
> - pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER, offset,
> - PCI_EXT_CAP_PASID_SIZEOF);
> -
> capability_reg = ((uint16_t)pasid_width) << PCI_PASID_CAP_WIDTH_SHIFT;
> capability_reg |= exec_perm ? PCI_PASID_CAP_EXEC : 0;
> capability_reg |= priv_mod ? PCI_PASID_CAP_PRIV : 0;
> @@ -1307,6 +1302,16 @@ void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
> pci_set_word(dev->wmask + offset + PCI_PASID_CTRL, control_reg_rw_mask);
>
> dev->exp.pasid_cap = offset;
> +
> +}
> +
> +/* PASID */
> +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
> + bool exec_perm, bool priv_mod)
> +{
> + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER, offset,
> + PCI_EXT_CAP_PASID_SIZEOF);
> + pcie_pasid_common_init(dev, offset, pasid_width, exec_perm, priv_mod);
> }
>
> /* PRI */
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index d68bfa6257..fc02aeb169 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -155,6 +155,8 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
> void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev,
> DeviceState *dev, Error **errp);
>
> +void pcie_pasid_common_init(PCIDevice *dev, uint16_t offset,
> + uint8_t pasid_width, bool exec_perm, bool priv_mod);
> void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width,
> bool exec_perm, bool priv_mod);
> void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap,
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 36/37] hw/vfio/pci: Synthesize PASID capability for vfio-pci devices
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (34 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 35/37] hw/pci: Factor out common PASID capability initialization Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 12:07 ` Yi Liu
2026-01-26 10:43 ` [PATCH v9 37/37] hw/arm/smmuv3-accel: Make SubstreamID support configurable Shameer Kolothum
2026-01-26 14:56 ` [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Peter Maydell
37 siblings, 1 reply; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
Add support for synthesizing a PCIe PASID extended capability for
vfio-pci devices when PASID is enabled via a vIOMMU and supported by
the host IOMMU backend.
PASID capability parameters are retrieved via IOMMUFD APIs and the
capability is inserted into the PCIe extended capability list using
the pcie_insert_capability() helper. A new x-vpasid-cap-offset property allows
explicit control over the placement; by default the capability is
placed at the end of the PCIe extended configuration space.
If the kernel does not expose PASID information or insertion fails,
the device continues without PASID support.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++++++++
hw/vfio/pci.h | 1 +
hw/vfio/trace-events | 1 +
include/hw/core/iommu.h | 1 +
4 files changed, 78 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c734472721..36d8fbe872 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -24,6 +24,7 @@
#include <sys/ioctl.h>
#include "hw/core/hw-error.h"
+#include "hw/core/iommu.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
@@ -2498,9 +2499,62 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
return 0;
}
+/*
+ * Try to retrieve PASID capability information via IOMMUFD APIs and,
+ * if supported, synthesize a PASID PCIe extended capability for the
+ * VFIO device.
+ *
+ * Use user-specified PASID capability offset if provided, otherwise
+ * place it at the end of the PCIe extended configuration space.
+ */
+static bool vfio_pci_synthesize_pasid_cap(VFIOPCIDevice *vdev, Error **errp)
+{
+ HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
+ HostIOMMUDeviceClass *hiodc;
+ PasidInfo pasid_info;
+ PCIDevice *pdev = PCI_DEVICE(vdev);
+ uint16_t pasid_offset;
+
+ if (!hiod) {
+ return true;
+ }
+
+ hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+ if (!hiodc || !hiodc->get_pasid_info ||
+ !hiodc->get_pasid_info(hiod, &pasid_info) ||
+ !(pci_device_get_viommu_flags(pdev) & VIOMMU_FLAG_PASID_SUPPORTED)) {
+ return true;
+ }
+
+ /* Use user-specified offset if set, otherwise place PASID at the end. */
+ if (vdev->vpasid_cap_offset) {
+ pasid_offset = vdev->vpasid_cap_offset;
+ } else {
+ pasid_offset = PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF;
+ }
+
+ if (!pcie_insert_capability(pdev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER,
+ pasid_offset, PCI_EXT_CAP_PASID_SIZEOF)) {
+ error_setg(errp, "vfio: Placing PASID capability at offset 0x%x failed",
+ pasid_offset);
+ return false;
+ }
+ trace_vfio_pci_synthesize_pasid_cap(vdev->vbasedev.name, pasid_offset);
+
+ pcie_pasid_common_init(pdev, pasid_offset, pasid_info.max_pasid_log2,
+ pasid_info.exec_perm, pasid_info.priv_mod);
+
+ /* PASID capability is fully emulated by QEMU */
+ memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff,
+ PCI_EXT_CAP_PASID_SIZEOF);
+ return true;
+}
+
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
PCIDevice *pdev = PCI_DEVICE(vdev);
+ bool pasid_cap_added = false;
+ Error *err = NULL;
uint32_t header;
uint16_t cap_id, next, size;
uint8_t cap_ver;
@@ -2578,12 +2632,24 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
pcie_add_capability(pdev, cap_id, cap_ver, next, size);
}
break;
+ /*
+ * VFIO kernel does not expose the PASID CAP today. We may synthesize
+ * one later through IOMMUFD APIs. If VFIO ever starts exposing it,
+ * record its presence here so we do not create a duplicate CAP.
+ */
+ case PCI_EXT_CAP_ID_PASID:
+ pasid_cap_added = true;
+ /* fallthrough */
default:
pcie_add_capability(pdev, cap_id, cap_ver, next, size);
}
}
+ if (!pasid_cap_added && !vfio_pci_synthesize_pasid_cap(vdev, &err)) {
+ error_report_err(err);
+ }
+
/* Cleanup chain head ID if necessary */
if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
@@ -3756,6 +3822,8 @@ static const Property vfio_pci_properties[] = {
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
+ DEFINE_PROP_UINT16("x-vpasid-cap-offset", VFIOPCIDevice,
+ vpasid_cap_offset, 0),
};
#ifdef CONFIG_IOMMUFD
@@ -3913,6 +3981,13 @@ static void vfio_pci_class_init(ObjectClass *klass, const void *data)
"destination when doing live "
"migration of device state via "
"multifd channels");
+ object_class_property_set_description(klass, /* 11.0 */
+ "x-vpasid-cap-offset",
+ "PCIe extended configuration space offset at which to place a "
+ "synthetic PASID extended capability when PASID is enabled via "
+ "a vIOMMU. A value of 0 (default) places the capability at the "
+ "end of the extended configuration space. The offset must be "
+ "4-byte aligned and within the PCIe extended configuration space");
}
static const TypeInfo vfio_pci_info = {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 0f78cf9cdb..d6495d7f29 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -187,6 +187,7 @@ struct VFIOPCIDevice {
bool defer_kvm_irq_routing;
bool clear_parent_atomics_on_exit;
bool skip_vsc_check;
+ uint16_t vpasid_cap_offset;
VFIODisplay *dpy;
Notifier irqchip_change_notifier;
VFIOPCICPR cpr;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 180e3d526b..b48c4abe7a 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -40,6 +40,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
+vfio_pci_synthesize_pasid_cap(const char *name, uint16_t offset) "%s offset: 0x%x"
vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
vfio_pci_reset(const char *name) " (%s)"
vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
index d5401a397b..86af315c15 100644
--- a/include/hw/core/iommu.h
+++ b/include/hw/core/iommu.h
@@ -20,6 +20,7 @@
enum viommu_flags {
/* vIOMMU needs nesting parent HWPT to create nested HWPT */
VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0),
+ VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1),
};
/* Host IOMMU quirks. Extracted from host IOMMU capabilities */
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 36/37] hw/vfio/pci: Synthesize PASID capability for vfio-pci devices
2026-01-26 10:43 ` [PATCH v9 36/37] hw/vfio/pci: Synthesize PASID capability for vfio-pci devices Shameer Kolothum
@ 2026-01-26 12:07 ` Yi Liu
0 siblings, 0 replies; 68+ messages in thread
From: Yi Liu @ 2026-01-26 12:07 UTC (permalink / raw)
To: Shameer Kolothum, qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, kjaju
On 2026/1/26 18:43, Shameer Kolothum wrote:
> Add support for synthesizing a PCIe PASID extended capability for
> vfio-pci devices when PASID is enabled via a vIOMMU and supported by
> the host IOMMU backend.
>
> PASID capability parameters are retrieved via IOMMUFD APIs and the
> capability is inserted into the PCIe extended capability list using
> the pcie_insert_capability() helper. A new x-vpasid-cap-offset property allows
> explicit control over the placement; by default the capability is
> placed at the end of the PCIe extended configuration space.
>
> If the kernel does not expose PASID information or insertion fails,
> the device continues without PASID support.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Reviewed-by: Cédric Le Goater <clg@redhat.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++++++++
> hw/vfio/pci.h | 1 +
> hw/vfio/trace-events | 1 +
> include/hw/core/iommu.h | 1 +
> 4 files changed, 78 insertions(+)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index c734472721..36d8fbe872 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -24,6 +24,7 @@
> #include <sys/ioctl.h>
>
> #include "hw/core/hw-error.h"
> +#include "hw/core/iommu.h"
> #include "hw/pci/msi.h"
> #include "hw/pci/msix.h"
> #include "hw/pci/pci_bridge.h"
> @@ -2498,9 +2499,62 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
> return 0;
> }
>
> +/*
> + * Try to retrieve PASID capability information via IOMMUFD APIs and,
> + * if supported, synthesize a PASID PCIe extended capability for the
> + * VFIO device.
> + *
> + * Use user-specified PASID capability offset if provided, otherwise
> + * place it at the end of the PCIe extended configuration space.
> + */
> +static bool vfio_pci_synthesize_pasid_cap(VFIOPCIDevice *vdev, Error **errp)
> +{
> + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> + HostIOMMUDeviceClass *hiodc;
> + PasidInfo pasid_info;
> + PCIDevice *pdev = PCI_DEVICE(vdev);
> + uint16_t pasid_offset;
> +
> + if (!hiod) {
> + return true;
> + }
> +
> + hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> + if (!hiodc || !hiodc->get_pasid_info ||
> + !hiodc->get_pasid_info(hiod, &pasid_info) ||
> + !(pci_device_get_viommu_flags(pdev) & VIOMMU_FLAG_PASID_SUPPORTED)) {
> + return true;
> + }
> +
> + /* Use user-specified offset if set, otherwise place PASID at the end. */
> + if (vdev->vpasid_cap_offset) {
> + pasid_offset = vdev->vpasid_cap_offset;
> + } else {
> + pasid_offset = PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF;
> + }
> +
> + if (!pcie_insert_capability(pdev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER,
> + pasid_offset, PCI_EXT_CAP_PASID_SIZEOF)) {
> + error_setg(errp, "vfio: Placing PASID capability at offset 0x%x failed",
> + pasid_offset);
> + return false;
> + }
> + trace_vfio_pci_synthesize_pasid_cap(vdev->vbasedev.name, pasid_offset);
> +
> + pcie_pasid_common_init(pdev, pasid_offset, pasid_info.max_pasid_log2,
> + pasid_info.exec_perm, pasid_info.priv_mod);
> +
> + /* PASID capability is fully emulated by QEMU */
> + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff,
> + PCI_EXT_CAP_PASID_SIZEOF);
> + return true;
> +}
> +
> static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> {
> PCIDevice *pdev = PCI_DEVICE(vdev);
> + bool pasid_cap_added = false;
> + Error *err = NULL;
> uint32_t header;
> uint16_t cap_id, next, size;
> uint8_t cap_ver;
> @@ -2578,12 +2632,24 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
> break;
> + /*
> + * VFIO kernel does not expose the PASID CAP today. We may synthesize
> + * one later through IOMMUFD APIs. If VFIO ever starts exposing it,
> + * record its presence here so we do not create a duplicate CAP.
> + */
> + case PCI_EXT_CAP_ID_PASID:
> + pasid_cap_added = true;
> + /* fallthrough */
> default:
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
>
> }
>
> + if (!pasid_cap_added && !vfio_pci_synthesize_pasid_cap(vdev, &err)) {
> + error_report_err(err);
> + }
> +
> /* Cleanup chain head ID if necessary */
> if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
> @@ -3756,6 +3822,8 @@ static const Property vfio_pci_properties[] = {
> TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
> #endif
> DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
> + DEFINE_PROP_UINT16("x-vpasid-cap-offset", VFIOPCIDevice,
> + vpasid_cap_offset, 0),
> };
>
> #ifdef CONFIG_IOMMUFD
> @@ -3913,6 +3981,13 @@ static void vfio_pci_class_init(ObjectClass *klass, const void *data)
> "destination when doing live "
> "migration of device state via "
> "multifd channels");
> + object_class_property_set_description(klass, /* 11.0 */
> + "x-vpasid-cap-offset",
> + "PCIe extended configuration space offset at which to place a "
> + "synthetic PASID extended capability when PASID is enabled via "
> + "a vIOMMU. A value of 0 (default) places the capability at the "
> + "end of the extended configuration space. The offset must be "
> + "4-byte aligned and within the PCIe extended configuration space");
> }
>
> static const TypeInfo vfio_pci_info = {
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 0f78cf9cdb..d6495d7f29 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -187,6 +187,7 @@ struct VFIOPCIDevice {
> bool defer_kvm_irq_routing;
> bool clear_parent_atomics_on_exit;
> bool skip_vsc_check;
> + uint16_t vpasid_cap_offset;
> VFIODisplay *dpy;
> Notifier irqchip_change_notifier;
> VFIOPCICPR cpr;
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 180e3d526b..b48c4abe7a 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -40,6 +40,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
> vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
> vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
> vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
> +vfio_pci_synthesize_pasid_cap(const char *name, uint16_t offset) "%s offset: 0x%x"
> vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
> vfio_pci_reset(const char *name) " (%s)"
> vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> index d5401a397b..86af315c15 100644
> --- a/include/hw/core/iommu.h
> +++ b/include/hw/core/iommu.h
> @@ -20,6 +20,7 @@
> enum viommu_flags {
> /* vIOMMU needs nesting parent HWPT to create nested HWPT */
> VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0),
> + VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1),
> };
>
> /* Host IOMMU quirks. Extracted from host IOMMU capabilities */
^ permalink raw reply [flat|nested] 68+ messages in thread
* [PATCH v9 37/37] hw/arm/smmuv3-accel: Make SubstreamID support configurable
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (35 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 36/37] hw/vfio/pci: Synthesize PASID capability for vfio-pci devices Shameer Kolothum
@ 2026-01-26 10:43 ` Shameer Kolothum
2026-01-26 14:56 ` [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Peter Maydell
37 siblings, 0 replies; 68+ messages in thread
From: Shameer Kolothum @ 2026-01-26 10:43 UTC (permalink / raw)
To: qemu-arm, qemu-devel
Cc: eric.auger, peter.maydell, jgg, nicolinc, ddutile, berrange, clg,
alex, nathanc, mochs, smostafa, wangzhou1, jiangkunkun,
jonathan.cameron, zhangfei.gao, zhenzhong.duan, yi.l.liu, kjaju,
skolothumtho
QEMU SMMUv3 currently reports no SubstreamID support, forcing SSID to
zero. This prevents accelerated use cases such as Shared Virtual
Addressing (SVA), which require multiple Stage-1 context descriptors
indexed by SubstreamID.
Add a new "ssidsize" property to explicitly configure the number of bits
used for SubstreamIDs. A value greater than zero enables SubstreamID
support and makes the vIOMMU advertise PASID support through its
viommu flags (VIOMMU_FLAG_PASID_SUPPORTED).
The requested SSIDSIZE is validated against host SMMUv3 capabilities and
is only supported when accel=on.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 25 ++++++++++++++++++++++++-
hw/arm/smmuv3.c | 22 ++++++++++++++++++++--
include/hw/arm/smmuv3-common.h | 1 +
include/hw/arm/smmuv3.h | 1 +
4 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 342944da23..f5cd4df336 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -76,6 +76,16 @@ smmuv3_accel_check_hw_compatible(SMMUv3State *s,
return false;
}
+ /* Check SSIDSIZE value opted-in is compatible with Host SMMUv3 SSIDSIZE */
+ if (FIELD_EX32(info->idr[1], IDR1, SSIDSIZE) <
+ FIELD_EX32(s->idr[1], IDR1, SSIDSIZE)) {
+ error_setg(errp, "Host SMMUv3 SSIDSIZE not compatible "
+ "(host=%u, QEMU=%u)",
+ FIELD_EX32(info->idr[1], IDR1, SSIDSIZE),
+ FIELD_EX32(s->idr[1], IDR1, SSIDSIZE));
+ return false;
+ }
+
/* User can disable QEMU SMMUv3 Range Invalidation support */
if (FIELD_EX32(info->idr[3], IDR3, RIL) <
FIELD_EX32(s->idr[3], IDR3, RIL)) {
@@ -652,7 +662,14 @@ static uint64_t smmuv3_accel_get_viommu_flags(void *opaque)
* The real HW nested support should be reported from host SMMUv3 and if
* it doesn't, the nesting parent allocation will fail anyway in VFIO core.
*/
- return VIOMMU_FLAG_WANT_NESTING_PARENT;
+ uint64_t flags = VIOMMU_FLAG_WANT_NESTING_PARENT;
+ SMMUState *bs = opaque;
+ SMMUv3State *s = ARM_SMMUV3(bs);
+
+ if (s->ssidsize) {
+ flags |= VIOMMU_FLAG_PASID_SUPPORTED;
+ }
+ return flags;
}
static const PCIIOMMUOps smmuv3_accel_ops = {
@@ -680,6 +697,12 @@ void smmuv3_accel_idr_override(SMMUv3State *s)
if (s->oas == SMMU_OAS_48BIT) {
s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS_48);
}
+
+ /*
+ * By default QEMU SMMUv3 has no SubstreamID support. Update IDR1 if user
+ * has enabled it.
+ */
+ s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, s->ssidsize);
}
/* Based on SMUUv3 GPBA.ABORT configuration, attach a corresponding HWPT */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index cb02184d2d..c08d58c579 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -611,9 +611,11 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
}
}
- if (STE_S1CDMAX(ste) != 0) {
+ /* Multiple context descriptors require SubstreamID support */
+ if (!s->ssidsize && STE_S1CDMAX(ste) != 0) {
qemu_log_mask(LOG_UNIMP,
- "SMMUv3 does not support multiple context descriptors yet\n");
+ "SMMUv3: multiple S1 context descriptors require SubstreamID support. "
+ "Configure ssidsize > 0 (requires accel=on)\n");
goto bad_ste;
}
@@ -1954,6 +1956,10 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
error_setg(errp, "OAS must be 44 bits when accel=off");
return false;
}
+ if (s->ssidsize) {
+ error_setg(errp, "ssidsize can only be set if accel=on");
+ return false;
+ }
return true;
}
@@ -1968,6 +1974,11 @@ static bool smmu_validate_property(SMMUv3State *s, Error **errp)
error_setg(errp, "OAS can only be set to 44 or 48 bits");
return false;
}
+ if (s->ssidsize > SMMU_SSID_MAX_BITS) {
+ error_setg(errp, "ssidsize must be in the range 0 to %d",
+ SMMU_SSID_MAX_BITS);
+ return false;
+ }
return true;
}
@@ -2096,6 +2107,7 @@ static const Property smmuv3_properties[] = {
DEFINE_PROP_BOOL("ril", SMMUv3State, ril, true),
DEFINE_PROP_BOOL("ats", SMMUv3State, ats, false),
DEFINE_PROP_UINT8("oas", SMMUv3State, oas, 44),
+ DEFINE_PROP_UINT8("ssidsize", SMMUv3State, ssidsize, 0),
};
static void smmuv3_instance_init(Object *obj)
@@ -2129,6 +2141,12 @@ static void smmuv3_class_init(ObjectClass *klass, const void *data)
object_class_property_set_description(klass, "oas",
"Specify Output Address Size (for accel=on). Supported values "
"are 44 or 48 bits. Defaults to 44 bits");
+ object_class_property_set_description(klass, "ssidsize",
+ "Number of bits used to represent SubstreamIDs (SSIDs). "
+ "A value of N allows SSIDs in the range [0 .. 2^N - 1]. "
+ "Valid range is 0-20, where 0 disables SubstreamID support. "
+ "Defaults to 0. A value greater than 0 is required to enable "
+ "PASID support.");
}
static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
diff --git a/include/hw/arm/smmuv3-common.h b/include/hw/arm/smmuv3-common.h
index abe3565357..67a23fbeaa 100644
--- a/include/hw/arm/smmuv3-common.h
+++ b/include/hw/arm/smmuv3-common.h
@@ -311,6 +311,7 @@ REG32(IDR1, 0x4)
FIELD(IDR1, TABLES_PRESET, 30, 1)
FIELD(IDR1, ECMDQ, 31, 1)
+#define SMMU_SSID_MAX_BITS 20
#define SMMU_IDR1_SIDSIZE 16
#define SMMU_CMDQS 19
#define SMMU_EVENTQS 19
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index d488a39cd0..26b2fc42fd 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -72,6 +72,7 @@ struct SMMUv3State {
bool ril;
bool ats;
uint8_t oas;
+ uint8_t ssidsize;
};
typedef enum {
--
2.43.0
^ permalink raw reply related [flat|nested] 68+ messages in thread* Re: [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3
2026-01-26 10:42 [PATCH v9 00/37] hw/arm/virt: Add support for user-creatable accelerated SMMUv3 Shameer Kolothum
` (36 preceding siblings ...)
2026-01-26 10:43 ` [PATCH v9 37/37] hw/arm/smmuv3-accel: Make SubstreamID support configurable Shameer Kolothum
@ 2026-01-26 14:56 ` Peter Maydell
37 siblings, 0 replies; 68+ messages in thread
From: Peter Maydell @ 2026-01-26 14:56 UTC (permalink / raw)
To: Shameer Kolothum
Cc: qemu-arm, qemu-devel, eric.auger, jgg, nicolinc, ddutile,
berrange, clg, alex, nathanc, mochs, smostafa, wangzhou1,
jiangkunkun, jonathan.cameron, zhangfei.gao, zhenzhong.duan,
yi.l.liu, kjaju
On Mon, 26 Jan 2026 at 10:44, Shameer Kolothum <skolothumtho@nvidia.com> wrote:
>
> Hi,
>
> Changes from v8:
> https://lore.kernel.org/qemu-devel/20260121175248.87649-1-skolothumtho@nvidia.com/
> - Collected R-b tags. Thanks!
> - Initialise the AcpiIortSMMUv3Dev struct (patch #25) to fix the reported
> CI test failure[0]. Please find the latest CI run results here[1].
Thanks for investigating the CI failure -- I've applied this version
to target-arm.next.
-- PMM
^ permalink raw reply [flat|nested] 68+ messages in thread