qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] vfio/spapr: Fix L2 crash with PCI device passthrough with L2 guest memory > 128G
@ 2025-04-04  9:17 Amit Machhiwal
  2025-04-04 11:29 ` Cédric Le Goater
  0 siblings, 1 reply; 3+ messages in thread
From: Amit Machhiwal @ 2025-04-04  9:17 UTC (permalink / raw)
  To: qemu-ppc, Nicholas Piggin, Harsh Prateek Bora
  Cc: qemu-devel, Amit Machhiwal, Vaibhav Jain, Shivaprasad G Bhat,
	Daniel Henrique Barboza, Alex Williamson, Cédric Le Goater

An L2 KVM guest fails to boot inside a pSeries LPAR when booted with a
memory more than 128 GB and PCI device passthrough. The L2 guest also
crashes when it is booted with a memory greater than 128 GB and a PCI
device is hotplugged later.

The issue arises from a conditional check for `levels > 1` in
`spapr_tce_create_table()` within L1 KVM. This check is meant to prevent
multi-level TCEs, which are not supported by the PowerVM hypervisor. As
a result, when QEMU makes a `VFIO_IOMMU_SPAPR_TCE_CREATE` ioctl call
with `levels > 1`, it triggers the conditional check and returns
`EINVAL`, causing the guest to crash with the following errors:

 2025-03-04T06:36:36.133117Z qemu-system-ppc64: Failed to create a window, ret = -1 (Invalid argument)
 2025-03-04T06:36:36.133176Z qemu-system-ppc64: Failed to create SPAPR window: Invalid argument
 qemu: hardware error: vfio: DMA mapping failed, unable to continue

Fix this by checking the supported DDW "levels" returned by the
VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctl before attempting the TCE create
ioctl in KVM.

The patch has been tested on KVM guests with memory configurations of up
to 390GB, and 450GB on PowerVM and bare-metal environments respectively.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 hw/vfio/spapr.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 1a5d1611f2cd..07498218fea9 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -26,6 +26,7 @@ typedef struct VFIOSpaprContainer {
     VFIOContainer container;
     MemoryListener prereg_listener;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+    unsigned int levels;
 } VFIOSpaprContainer;
 
 OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
@@ -236,9 +237,11 @@ static int vfio_spapr_create_window(VFIOContainer *container,
 {
     int ret = 0;
     VFIOContainerBase *bcontainer = &container->bcontainer;
+    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+                                                  container);
     IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
     uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
-    unsigned entries, bits_total, bits_per_level, max_levels;
+    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
     struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
     long rampagesize = qemu_minrampagesize();
 
@@ -291,16 +294,28 @@ static int vfio_spapr_create_window(VFIOContainer *container,
      */
     bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
     create.levels = bits_total / bits_per_level;
-    if (bits_total % bits_per_level) {
-        ++create.levels;
-    }
-    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
-    for ( ; create.levels <= max_levels; ++create.levels) {
-        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
-        if (!ret) {
-            break;
+
+    ddw_levels = scontainer->levels;
+    if (ddw_levels > 1) {
+        if (bits_total % bits_per_level) {
+            ++create.levels;
         }
+        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
+        for ( ; create.levels <= max_levels; ++create.levels) {
+            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+            if (!ret) {
+                break;
+            }
+        }
+    } else { /* ddw_levels == 1 */
+        if (create.levels > ddw_levels) {
+            error_report("Host doesn't support multi-level TCE tables. "
+                         "Use larger IO page size. Supported mask is 0x%lx",
+                         bcontainer->pgsizes);
+        }
+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
     }
+
     if (ret) {
         error_report("Failed to create a window, ret = %d (%m)", ret);
         return -errno;
@@ -502,6 +517,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
         goto listener_unregister_exit;
     }
 
+    scontainer->levels = info.ddw.levels;
+
     if (v2) {
         bcontainer->pgsizes = info.ddw.pgsizes;
         /*

base-commit: 0adf626718bc0ca9c46550249a76047f8e45da15
-- 
2.49.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] vfio/spapr: Fix L2 crash with PCI device passthrough with L2 guest memory > 128G
  2025-04-04  9:17 [PATCH] vfio/spapr: Fix L2 crash with PCI device passthrough with L2 guest memory > 128G Amit Machhiwal
@ 2025-04-04 11:29 ` Cédric Le Goater
  2025-04-07  8:45   ` Amit Machhiwal
  0 siblings, 1 reply; 3+ messages in thread
From: Cédric Le Goater @ 2025-04-04 11:29 UTC (permalink / raw)
  To: Amit Machhiwal, qemu-ppc, Nicholas Piggin, Harsh Prateek Bora
  Cc: qemu-devel, Vaibhav Jain, Shivaprasad G Bhat,
	Daniel Henrique Barboza, Alex Williamson

On 4/4/25 11:17, Amit Machhiwal wrote:
> An L2 KVM guest fails to boot inside a pSeries LPAR when booted with a
> memory more than 128 GB and PCI device passthrough. The L2 guest also
> crashes when it is booted with a memory greater than 128 GB and a PCI
> device is hotplugged later.
> 
> The issue arises from a conditional check for `levels > 1` in
> `spapr_tce_create_table()` within L1 KVM. This check is meant to prevent
> multi-level TCEs, which are not supported by the PowerVM hypervisor. As
> a result, when QEMU makes a `VFIO_IOMMU_SPAPR_TCE_CREATE` ioctl call
> with `levels > 1`, it triggers the conditional check and returns
> `EINVAL`, causing the guest to crash with the following errors:
> 
>   2025-03-04T06:36:36.133117Z qemu-system-ppc64: Failed to create a window, ret = -1 (Invalid argument)
>   2025-03-04T06:36:36.133176Z qemu-system-ppc64: Failed to create SPAPR window: Invalid argument
>   qemu: hardware error: vfio: DMA mapping failed, unable to continue
> 
> Fix this by checking the supported DDW "levels" returned by the
> VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctl before attempting the TCE create
> ioctl in KVM.
> 
> The patch has been tested on KVM guests with memory configurations of up
> to 390GB, and 450GB on PowerVM and bare-metal environments respectively.
> > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> ---
>   hw/vfio/spapr.c | 35 ++++++++++++++++++++++++++---------
>   1 file changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> index 1a5d1611f2cd..07498218fea9 100644
> --- a/hw/vfio/spapr.c
> +++ b/hw/vfio/spapr.c
> @@ -26,6 +26,7 @@ typedef struct VFIOSpaprContainer {
>       VFIOContainer container;
>       MemoryListener prereg_listener;
>       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
> +    unsigned int levels;
>   } VFIOSpaprContainer;
>   
>   OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
> @@ -236,9 +237,11 @@ static int vfio_spapr_create_window(VFIOContainer *container,
>   {
>       int ret = 0;
>       VFIOContainerBase *bcontainer = &container->bcontainer;
> +    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
> +                                                  container);
>       IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
>       uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
> -    unsigned entries, bits_total, bits_per_level, max_levels;
> +    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
>       struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
>       long rampagesize = qemu_minrampagesize();
>   
> @@ -291,16 +294,28 @@ static int vfio_spapr_create_window(VFIOContainer *container,
>        */
>       bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
>       create.levels = bits_total / bits_per_level;
> -    if (bits_total % bits_per_level) {
> -        ++create.levels;
> -    }
> -    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
> -    for ( ; create.levels <= max_levels; ++create.levels) {
> -        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> -        if (!ret) {
> -            break;
> +
> +    ddw_levels = scontainer->levels;
> +    if (ddw_levels > 1) {
> +        if (bits_total % bits_per_level) {
> +            ++create.levels;
>           }
> +        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
> +        for ( ; create.levels <= max_levels; ++create.levels) {
> +            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> +            if (!ret) {
> +                break;
> +            }
> +        }
> +    } else { /* ddw_levels == 1 */
> +        if (create.levels > ddw_levels) {
> +            error_report("Host doesn't support multi-level TCE tables. "
> +                         "Use larger IO page size. Supported mask is 0x%lx",
> +                         bcontainer->pgsizes);

While at it, please modify vfio_spapr_create_window(), add an 'Error **'
parameter to report errors to the caller with error_setg(errp ...)

Thanks,

C.




> +        }
> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>       }
> +
>       if (ret) {
>           error_report("Failed to create a window, ret = %d (%m)", ret);
>           return -errno;
> @@ -502,6 +517,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
>           goto listener_unregister_exit;
>       }
>   
> +    scontainer->levels = info.ddw.levels;
> +
>       if (v2) {
>           bcontainer->pgsizes = info.ddw.pgsizes;
>           /*
> 
> base-commit: 0adf626718bc0ca9c46550249a76047f8e45da15



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] vfio/spapr: Fix L2 crash with PCI device passthrough with L2 guest memory > 128G
  2025-04-04 11:29 ` Cédric Le Goater
@ 2025-04-07  8:45   ` Amit Machhiwal
  0 siblings, 0 replies; 3+ messages in thread
From: Amit Machhiwal @ 2025-04-07  8:45 UTC (permalink / raw)
  To: Cédric Le Goater
  Cc: qemu-ppc, Nicholas Piggin, Harsh Prateek Bora, qemu-devel,
	Vaibhav Jain, Shivaprasad G Bhat, Daniel Henrique Barboza,
	Alex Williamson

Hi Cédric,

Thanks for looking into this patch. Please find my response inline:

On 2025/04/04 01:29 PM, Cédric Le Goater wrote:
> On 4/4/25 11:17, Amit Machhiwal wrote:
> > An L2 KVM guest fails to boot inside a pSeries LPAR when booted with a
> > memory more than 128 GB and PCI device passthrough. The L2 guest also
> > crashes when it is booted with a memory greater than 128 GB and a PCI
> > device is hotplugged later.
> > 
> > The issue arises from a conditional check for `levels > 1` in
> > `spapr_tce_create_table()` within L1 KVM. This check is meant to prevent
> > multi-level TCEs, which are not supported by the PowerVM hypervisor. As
> > a result, when QEMU makes a `VFIO_IOMMU_SPAPR_TCE_CREATE` ioctl call
> > with `levels > 1`, it triggers the conditional check and returns
> > `EINVAL`, causing the guest to crash with the following errors:
> > 
> >   2025-03-04T06:36:36.133117Z qemu-system-ppc64: Failed to create a window, ret = -1 (Invalid argument)
> >   2025-03-04T06:36:36.133176Z qemu-system-ppc64: Failed to create SPAPR window: Invalid argument
> >   qemu: hardware error: vfio: DMA mapping failed, unable to continue
> > 
> > Fix this by checking the supported DDW "levels" returned by the
> > VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctl before attempting the TCE create
> > ioctl in KVM.
> > 
> > The patch has been tested on KVM guests with memory configurations of up
> > to 390GB, and 450GB on PowerVM and bare-metal environments respectively.
> > > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > ---
> >   hw/vfio/spapr.c | 35 ++++++++++++++++++++++++++---------
> >   1 file changed, 26 insertions(+), 9 deletions(-)
> > 
> > diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> > index 1a5d1611f2cd..07498218fea9 100644
> > --- a/hw/vfio/spapr.c
> > +++ b/hw/vfio/spapr.c
> > @@ -26,6 +26,7 @@ typedef struct VFIOSpaprContainer {
> >       VFIOContainer container;
> >       MemoryListener prereg_listener;
> >       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
> > +    unsigned int levels;
> >   } VFIOSpaprContainer;
> >   OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
> > @@ -236,9 +237,11 @@ static int vfio_spapr_create_window(VFIOContainer *container,
> >   {
> >       int ret = 0;
> >       VFIOContainerBase *bcontainer = &container->bcontainer;
> > +    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
> > +                                                  container);
> >       IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
> >       uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
> > -    unsigned entries, bits_total, bits_per_level, max_levels;
> > +    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
> >       struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
> >       long rampagesize = qemu_minrampagesize();
> > @@ -291,16 +294,28 @@ static int vfio_spapr_create_window(VFIOContainer *container,
> >        */
> >       bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
> >       create.levels = bits_total / bits_per_level;
> > -    if (bits_total % bits_per_level) {
> > -        ++create.levels;
> > -    }
> > -    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
> > -    for ( ; create.levels <= max_levels; ++create.levels) {
> > -        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> > -        if (!ret) {
> > -            break;
> > +
> > +    ddw_levels = scontainer->levels;
> > +    if (ddw_levels > 1) {
> > +        if (bits_total % bits_per_level) {
> > +            ++create.levels;
> >           }
> > +        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
> > +        for ( ; create.levels <= max_levels; ++create.levels) {
> > +            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> > +            if (!ret) {
> > +                break;
> > +            }
> > +        }
> > +    } else { /* ddw_levels == 1 */
> > +        if (create.levels > ddw_levels) {
> > +            error_report("Host doesn't support multi-level TCE tables. "
> > +                         "Use larger IO page size. Supported mask is 0x%lx",
> > +                         bcontainer->pgsizes);
> 
> While at it, please modify vfio_spapr_create_window(), add an 'Error **'
> parameter to report errors to the caller with error_setg(errp ...)

Sure, I'll include the suggested changes and send a v2 soon.

Thanks,
Amit

> 
> Thanks,
> 
> C.
> 
> 
> 
> 
> > +        }
> > +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> >       }
> > +
> >       if (ret) {
> >           error_report("Failed to create a window, ret = %d (%m)", ret);
> >           return -errno;
> > @@ -502,6 +517,8 @@ static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
> >           goto listener_unregister_exit;
> >       }
> > +    scontainer->levels = info.ddw.levels;
> > +
> >       if (v2) {
> >           bcontainer->pgsizes = info.ddw.pgsizes;
> >           /*
> > 
> > base-commit: 0adf626718bc0ca9c46550249a76047f8e45da15
> 


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-04-07  8:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-04  9:17 [PATCH] vfio/spapr: Fix L2 crash with PCI device passthrough with L2 guest memory > 128G Amit Machhiwal
2025-04-04 11:29 ` Cédric Le Goater
2025-04-07  8:45   ` Amit Machhiwal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).