All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/3] new CONFIG_HAS_PIRQ and extra_guest_irqs adjustment
@ 2023-07-27  7:37 Jan Beulich
  2023-07-27  7:38 ` [PATCH v3 1/3] restrict concept of pIRQ to x86 Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Jan Beulich @ 2023-07-27  7:37 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné, Bertrand Marquis,
	Volodymyr Babchuk

New patch 2 addresses issues found while looking at pirq_cleanup_check()
in a Misra context.

1: restrict concept of pIRQ to x86
2: pirq_cleanup_check() leaks
3: cmdline: document and enforce "extra_guest_irqs" upper bounds

Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 1/3] restrict concept of pIRQ to x86
  2023-07-27  7:37 [PATCH v3 0/3] new CONFIG_HAS_PIRQ and extra_guest_irqs adjustment Jan Beulich
@ 2023-07-27  7:38 ` Jan Beulich
  2024-03-12 22:54   ` Julien Grall
  2023-07-27  7:38 ` [PATCH v3 2/3] pirq_cleanup_check() leaks Jan Beulich
  2023-07-27  7:38 ` [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds Jan Beulich
  2 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2023-07-27  7:38 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné, Bertrand Marquis,
	Volodymyr Babchuk

... by way of a new arch-selectable Kconfig control.

Note that some smaller pieces of code are left without #ifdef, to keep
things better readable. Hence items like ECS_PIRQ, nr_static_irqs, or
domain_pirq_to_irq() remain uniformly.

As to XEN_DOMCTL_irq_permission - this, despite having a uniformly
available wrapper in libxc, is unused on Arm: libxl bypasses those
calls, and the Python and OCamL bindings have no users at all.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Move #ifdef in xen/common/domctl.c around the entire sub-op.
    Re-base.
v2: New.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1136,7 +1136,7 @@ introduced with the Nehalem architecture
       intended as an emergency option for people who first chose fast, then
       change their minds to secure, and wish not to reboot.**
 
-### extra_guest_irqs
+### extra_guest_irqs (x86)
 > `= [<domU number>][,<dom0 number>]`
 
 > Default: `32,<variable>`
--- a/xen/arch/arm/include/asm/irq.h
+++ b/xen/arch/arm/include/asm/irq.h
@@ -52,7 +52,6 @@ struct arch_irq_desc {
 
 extern const unsigned int nr_irqs;
 #define nr_static_irqs NR_IRQS
-#define arch_hwdom_irqs(domid) NR_IRQS
 
 struct irq_desc;
 struct irqaction;
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
 	select HAS_PCI
 	select HAS_PCI_MSI
 	select HAS_PDX
+	select HAS_PIRQ
 	select HAS_SCHED_GRANULARITY
 	select HAS_UBSAN
 	select HAS_VPCI if HVM
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -56,6 +56,9 @@ config HAS_KEXEC
 config HAS_PDX
 	bool
 
+config HAS_PIRQ
+	bool
+
 config HAS_PMAP
 	bool
 
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -350,6 +350,8 @@ static int late_hwdom_init(struct domain
 #endif
 }
 
+#ifdef CONFIG_HAS_PIRQ
+
 static unsigned int __read_mostly extra_hwdom_irqs;
 static unsigned int __read_mostly extra_domU_irqs = 32;
 
@@ -364,6 +366,8 @@ static int __init cf_check parse_extra_g
 }
 custom_param("extra_guest_irqs", parse_extra_guest_irqs);
 
+#endif /* CONFIG_HAS_PIRQ */
+
 static int __init cf_check parse_dom0_param(const char *s)
 {
     const char *ss;
@@ -682,6 +686,7 @@ struct domain *domain_create(domid_t dom
     if ( is_system_domain(d) && !is_idle_domain(d) )
         return d;
 
+#ifdef CONFIG_HAS_PIRQ
     if ( !is_idle_domain(d) )
     {
         if ( !is_hardware_domain(d) )
@@ -693,6 +698,7 @@ struct domain *domain_create(domid_t dom
 
         radix_tree_init(&d->pirq_tree);
     }
+#endif
 
     if ( (err = arch_domain_create(d, config, flags)) != 0 )
         goto fail;
@@ -784,7 +790,9 @@ struct domain *domain_create(domid_t dom
     {
         evtchn_destroy(d);
         evtchn_destroy_final(d);
+#ifdef CONFIG_HAS_PIRQ
         radix_tree_destroy(&d->pirq_tree, free_pirq_struct);
+#endif
     }
     if ( init_status & INIT_watchdog )
         watchdog_domain_destroy(d);
@@ -1180,7 +1188,9 @@ static void cf_check complete_domain_des
 
     evtchn_destroy_final(d);
 
+#ifdef CONFIG_HAS_PIRQ
     radix_tree_destroy(&d->pirq_tree, free_pirq_struct);
+#endif
 
     xfree(d->vcpu);
 
@@ -1893,6 +1903,8 @@ long do_vm_assist(unsigned int cmd, unsi
 }
 #endif
 
+#ifdef CONFIG_HAS_PIRQ
+
 struct pirq *pirq_get_info(struct domain *d, int pirq)
 {
     struct pirq *info = pirq_info(d, pirq);
@@ -1922,6 +1934,8 @@ void cf_check free_pirq_struct(void *ptr
     call_rcu(&pirq->rcu_head, _free_pirq_struct);
 }
 
+#endif /* CONFIG_HAS_PIRQ */
+
 struct migrate_info {
     long (*func)(void *data);
     void *data;
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -648,6 +648,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
         }
         break;
 
+#ifdef CONFIG_HAS_PIRQ
     case XEN_DOMCTL_irq_permission:
     {
         unsigned int pirq = op->u.irq_permission.pirq, irq;
@@ -667,6 +668,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
             ret = irq_deny_access(d, irq);
         break;
     }
+#endif
 
     case XEN_DOMCTL_iomem_permission:
     {
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -567,6 +567,7 @@ static int evtchn_bind_ipi(evtchn_bind_i
     return rc;
 }
 
+#ifdef CONFIG_HAS_PIRQ
 
 static void link_pirq_port(int port, struct evtchn *chn, struct vcpu *v)
 {
@@ -592,9 +593,11 @@ static void unlink_pirq_port(struct evtc
             chn->u.pirq.prev_port;
 }
 
+#endif /* CONFIG_HAS_PIRQ */
 
 static int evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
 {
+#ifdef CONFIG_HAS_PIRQ
     struct evtchn *chn;
     struct domain *d = current->domain;
     struct vcpu   *v = d->vcpu[0];
@@ -664,6 +667,9 @@ static int evtchn_bind_pirq(evtchn_bind_
     write_unlock(&d->event_lock);
 
     return rc;
+#else /* !CONFIG_HAS_PIRQ */
+    return -EOPNOTSUPP;
+#endif
 }
 
 
@@ -696,6 +702,7 @@ int evtchn_close(struct domain *d1, int
     case ECS_UNBOUND:
         break;
 
+#ifdef CONFIG_HAS_PIRQ
     case ECS_PIRQ: {
         struct pirq *pirq = pirq_info(d1, chn1->u.pirq.irq);
 
@@ -705,14 +712,13 @@ int evtchn_close(struct domain *d1, int
                 pirq_guest_unbind(d1, pirq);
             pirq->evtchn = 0;
             pirq_cleanup_check(pirq, d1);
-#ifdef CONFIG_X86
             if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
                 unmap_domain_pirq_emuirq(d1, pirq->pirq);
-#endif
         }
         unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]);
         break;
     }
+#endif
 
     case ECS_VIRQ: {
         struct vcpu *v;
@@ -1122,6 +1128,8 @@ int evtchn_bind_vcpu(evtchn_port_t port,
     case ECS_INTERDOMAIN:
         chn->notify_vcpu_id = v->vcpu_id;
         break;
+
+#ifdef CONFIG_HAS_PIRQ
     case ECS_PIRQ:
         if ( chn->notify_vcpu_id == v->vcpu_id )
             break;
@@ -1131,6 +1139,8 @@ int evtchn_bind_vcpu(evtchn_port_t port,
                           cpumask_of(v->processor));
         link_pirq_port(port, chn, v);
         break;
+#endif
+
     default:
         rc = -EINVAL;
         break;
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -438,12 +438,14 @@ struct domain
 
     struct grant_table *grant_table;
 
+#ifdef CONFIG_HAS_PIRQ
     /*
      * Interrupt to event-channel mappings and other per-guest-pirq data.
      * Protected by the domain's event-channel spinlock.
      */
     struct radix_tree_root pirq_tree;
     unsigned int     nr_pirqs;
+#endif
 
     unsigned int     options;         /* copy of createdomain flags */
 



^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 2/3] pirq_cleanup_check() leaks
  2023-07-27  7:37 [PATCH v3 0/3] new CONFIG_HAS_PIRQ and extra_guest_irqs adjustment Jan Beulich
  2023-07-27  7:38 ` [PATCH v3 1/3] restrict concept of pIRQ to x86 Jan Beulich
@ 2023-07-27  7:38 ` Jan Beulich
  2024-07-01  8:55   ` Roger Pau Monné
  2023-07-27  7:38 ` [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds Jan Beulich
  2 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2023-07-27  7:38 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

Its original introduction had two issues: For one the "common" part of
the checks (carried out in the macro) was inverted. And then after
removal from the radix tree the structure wasn't scheduled for freeing.
(All structures still left in the radix tree would be freed upon domain
destruction, though.)

For the freeing to be safe even if it didn't use RCU (i.e. to avoid use-
after-free), re-arrange checks/operations in evtchn_close(), such that
the pointer wouldn't be used anymore after calling pirq_cleanup_check()
(noting that unmap_domain_pirq_emuirq() itself calls the function in the
success case).

Fixes: c24536b636f2 ("replace d->nr_pirqs sized arrays with radix tree")
Fixes: 79858fee307c ("xen: fix hvm_domain_use_pirq's behavior")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
This is fallout from me looking into whether the function and macro of
the same name could be suitably split, to please Misra rule 5.5. The
idea was apparently that the check done in the macro is the "common"
part, and the actual function would be per-architecture. Pretty clearly
this, if need be, could also be achieved by naming the actual function
e.g. arch_pirq_cleanup_check().

Despite my testing of this (to a certain degree), I'm wary of the
change, since the code has been the way it was for about 12 years. It
feels like I'm overlooking something crucial ...

The wrong check is also what explains why Arm got away without
implementing the function (prior to "restrict concept of pIRQ to x86"):
The compiler simply eliminated the two calls from event_channel.c.
---
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1349,6 +1349,7 @@ void (pirq_cleanup_check)(struct pirq *p
 
     if ( radix_tree_delete(&d->pirq_tree, pirq->pirq) != pirq )
         BUG();
+    free_pirq_struct(pirq);
 }
 
 /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
             if ( !is_hvm_domain(d1) )
                 pirq_guest_unbind(d1, pirq);
             pirq->evtchn = 0;
-            pirq_cleanup_check(pirq, d1);
-            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
-                unmap_domain_pirq_emuirq(d1, pirq->pirq);
+            if ( !is_hvm_domain(d1) ||
+                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
+                 unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 )
+                pirq_cleanup_check(pirq, d1);
         }
         unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]);
         break;
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -158,7 +158,7 @@ extern struct pirq *pirq_get_info(struct
 void pirq_cleanup_check(struct pirq *, struct domain *);
 
 #define pirq_cleanup_check(pirq, d) \
-    ((pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
+    (!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
 
 extern void pirq_guest_eoi(struct pirq *);
 extern void desc_guest_eoi(struct irq_desc *, struct pirq *);



^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2023-07-27  7:37 [PATCH v3 0/3] new CONFIG_HAS_PIRQ and extra_guest_irqs adjustment Jan Beulich
  2023-07-27  7:38 ` [PATCH v3 1/3] restrict concept of pIRQ to x86 Jan Beulich
  2023-07-27  7:38 ` [PATCH v3 2/3] pirq_cleanup_check() leaks Jan Beulich
@ 2023-07-27  7:38 ` Jan Beulich
  2024-07-01  9:55   ` Roger Pau Monné
  2 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2023-07-27  7:38 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

PHYSDEVOP_pirq_eoi_gmfn_v<N> accepting just a single GFN implies that no
more than 32k pIRQ-s can be used by a domain on x86. Document this upper
bound.

To also enforce the limit, (ab)use both arch_hwdom_irqs() (changing its
parameter type) and setup_system_domains(). This is primarily to avoid
exposing the two static variables or introducing yet further arch hooks.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Instead of passing dom_xen into arch_hwdom_irqs(), NULL could also be
used. That would make the connection to setup_system_domains() yet more
weak, though.

Passing the domain pointer instead of the domain ID would also allow
to return a possibly different value if sensible for PVH Dom0 (which
presently has no access to PHYSDEVOP_pirq_eoi_gmfn_v<N> in the first
place).
---
v2: Also enforce these bounds. Adjust doc to constrain the bound to x86
    only. Re-base over new earlier patch.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1146,7 +1146,8 @@ common for all domUs, while the optional
 is for dom0.  Changing the setting for domU has no impact on dom0 and vice
 versa.  For example to change dom0 without changing domU, use
 `extra_guest_irqs=,512`.  The default value for Dom0 and an eventual separate
-hardware domain is architecture dependent.
+hardware domain is architecture dependent.  The upper limit for both values on
+x86 is such that the resulting total number of IRQs can't be higher than 32768.
 Note that specifying zero as domU value means zero, while for dom0 it means
 to use the default.
 
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
            nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
 }
 
-unsigned int arch_hwdom_irqs(domid_t domid)
+unsigned int arch_hwdom_irqs(const struct domain *d)
 {
     unsigned int n = fls(num_present_cpus());
 
-    if ( !domid )
+    if ( is_system_domain(d) )
+        return PAGE_SIZE * BITS_PER_BYTE;
+
+    if ( !d->domain_id )
         n = min(n, dom0_max_vcpus());
     n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
 
     /* Bounded by the domain pirq eoi bitmap gfn. */
     n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
 
-    printk("Dom%d has maximum %u PIRQs\n", domid, n);
+    printk("%pd has maximum %u PIRQs\n", d, n);
 
     return n;
 }
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
             d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
         else
             d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
-                                           : arch_hwdom_irqs(domid);
+                                           : arch_hwdom_irqs(d);
         d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
 
         radix_tree_init(&d->pirq_tree);
@@ -819,6 +819,24 @@ void __init setup_system_domains(void)
     if ( IS_ERR(dom_xen) )
         panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
 
+#ifdef CONFIG_HAS_PIRQ
+    /* Bound-check values passed via "extra_guest_irqs=". */
+    {
+        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
+
+        if ( extra_hwdom_irqs > n - nr_static_irqs )
+        {
+            extra_hwdom_irqs = n - nr_static_irqs;
+            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
+        }
+        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
+        {
+            extra_domU_irqs = n - nr_static_irqs;
+            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
+        }
+    }
+#endif
+
     /*
      * Initialise our DOMID_IO domain.
      * This domain owns I/O pages that are within the range of the page_info
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -173,8 +173,9 @@ extern irq_desc_t *pirq_spin_lock_irq_de
 
 unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *);
 
+/* When passed a system domain, this returns the maximum permissible value. */
 #ifndef arch_hwdom_irqs
-unsigned int arch_hwdom_irqs(domid_t);
+unsigned int arch_hwdom_irqs(const struct domain *);
 #endif
 
 #ifndef arch_evtchn_bind_pirq



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 1/3] restrict concept of pIRQ to x86
  2023-07-27  7:38 ` [PATCH v3 1/3] restrict concept of pIRQ to x86 Jan Beulich
@ 2024-03-12 22:54   ` Julien Grall
  0 siblings, 0 replies; 20+ messages in thread
From: Julien Grall @ 2024-03-12 22:54 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné, Bertrand Marquis, Volodymyr Babchuk

Hi Jan,

On 27/07/2023 08:38, Jan Beulich wrote:
> ... by way of a new arch-selectable Kconfig control.
> 
> Note that some smaller pieces of code are left without #ifdef, to keep
> things better readable. Hence items like ECS_PIRQ, nr_static_irqs, or
> domain_pirq_to_irq() remain uniformly.
> 
> As to XEN_DOMCTL_irq_permission - this, despite having a uniformly
> available wrapper in libxc, is unused on Arm: libxl bypasses those
> calls, and the Python and OCamL bindings have no users at all.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Sorry for the late reply:

Acked-by: Julien Grall <jgrall@amazon.com>

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/3] pirq_cleanup_check() leaks
  2023-07-27  7:38 ` [PATCH v3 2/3] pirq_cleanup_check() leaks Jan Beulich
@ 2024-07-01  8:55   ` Roger Pau Monné
  2024-07-01  9:47     ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-01  8:55 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, George Dunlap,
	Julien Grall, Stefano Stabellini, Wei Liu

On Thu, Jul 27, 2023 at 09:38:29AM +0200, Jan Beulich wrote:
> Its original introduction had two issues: For one the "common" part of
> the checks (carried out in the macro) was inverted.

Is the current logic in evtchn_close() really malfunctioning?

pirq->evtchn = 0;
pirq_cleanup_check(pirq, d1); <- cleanup for PV domains
if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
    unmap_domain_pirq_emuirq(d1, pirq->pirq); <- cleanup for HVM domains

It would seem to me the pirq_cleanup_check() call just after setting
evtchn = 0 was done to account for PV domains, while the second
(hidden) pirq_cleanup_check() call in unmap_domain_pirq_emuirq() would
do the cleanup for HVM domains.

Maybe there's something that I'm missing, I have to admit the PIRQ
logic is awfully complicated, even more when we mix the HVM PIRQ
stuff.

> And then after
> removal from the radix tree the structure wasn't scheduled for freeing.
> (All structures still left in the radix tree would be freed upon domain
> destruction, though.)

So if my understanding is correct, we didn't have a leak due to the
missing free_pirq_struct() because the inverted check in
pirq_cleanup_check() macro prevented the removal from the radix tree,
and so stale entries would be left there and freed at domain
destruction?

> For the freeing to be safe even if it didn't use RCU (i.e. to avoid use-
> after-free), re-arrange checks/operations in evtchn_close(), such that
> the pointer wouldn't be used anymore after calling pirq_cleanup_check()
> (noting that unmap_domain_pirq_emuirq() itself calls the function in the
> success case).
> 
> Fixes: c24536b636f2 ("replace d->nr_pirqs sized arrays with radix tree")
> Fixes: 79858fee307c ("xen: fix hvm_domain_use_pirq's behavior")

See the comment above about the call to pirq_cleanup_check() in
unmap_domain_pirq_emuirq(), which might imply 79858fee307c is not
wrong, just confusing.

> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> This is fallout from me looking into whether the function and macro of
> the same name could be suitably split, to please Misra rule 5.5. The
> idea was apparently that the check done in the macro is the "common"
> part, and the actual function would be per-architecture. Pretty clearly
> this, if need be, could also be achieved by naming the actual function
> e.g. arch_pirq_cleanup_check().
> 
> Despite my testing of this (to a certain degree), I'm wary of the
> change, since the code has been the way it was for about 12 years. It
> feels like I'm overlooking something crucial ...
> 
> The wrong check is also what explains why Arm got away without
> implementing the function (prior to "restrict concept of pIRQ to x86"):
> The compiler simply eliminated the two calls from event_channel.c.
> ---
> v3: New.
> 
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -1349,6 +1349,7 @@ void (pirq_cleanup_check)(struct pirq *p
>  
>      if ( radix_tree_delete(&d->pirq_tree, pirq->pirq) != pirq )
>          BUG();
> +    free_pirq_struct(pirq);
>  }
>  
>  /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
>              if ( !is_hvm_domain(d1) )
>                  pirq_guest_unbind(d1, pirq);
>              pirq->evtchn = 0;
> -            pirq_cleanup_check(pirq, d1);
> -            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
> -                unmap_domain_pirq_emuirq(d1, pirq->pirq);
> +            if ( !is_hvm_domain(d1) ||
> +                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
> +                 unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 )

pirq_cleanup_check() already calls pirq_cleanup_check() itself.  Could
you please add a comment to note that unmap_domain_pirq_emuirq()
succeeding implies the call to pirq_cleanup_check() has already been
done?

Otherwise the logic here looks unbalanced by skipping the
pirq_cleanup_check() when unmap_domain_pirq_emuirq() succeeds.

> +                pirq_cleanup_check(pirq, d1);
>          }
>          unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]);
>          break;
> --- a/xen/include/xen/irq.h
> +++ b/xen/include/xen/irq.h
> @@ -158,7 +158,7 @@ extern struct pirq *pirq_get_info(struct
>  void pirq_cleanup_check(struct pirq *, struct domain *);
>  
>  #define pirq_cleanup_check(pirq, d) \
> -    ((pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
> +    (!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)

Not that you need to fix it here, but why not place this check in
pirq_cleanup_check() itself?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/3] pirq_cleanup_check() leaks
  2024-07-01  8:55   ` Roger Pau Monné
@ 2024-07-01  9:47     ` Jan Beulich
  2024-07-01 11:13       ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-01  9:47 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, George Dunlap,
	Julien Grall, Stefano Stabellini, Wei Liu

On 01.07.2024 10:55, Roger Pau Monné wrote:
> On Thu, Jul 27, 2023 at 09:38:29AM +0200, Jan Beulich wrote:
>> Its original introduction had two issues: For one the "common" part of
>> the checks (carried out in the macro) was inverted.
> 
> Is the current logic in evtchn_close() really malfunctioning?

First: I'm getting the impression that this entire comment doesn't relate
to the part of the description above, but to the 2nd paragraph further
down. Otherwise I'm afraid I may not properly understand your question,
and hence my response below may not make any sense at all.

> pirq->evtchn = 0;
> pirq_cleanup_check(pirq, d1); <- cleanup for PV domains
> if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
>     unmap_domain_pirq_emuirq(d1, pirq->pirq); <- cleanup for HVM domains
> 
> It would seem to me the pirq_cleanup_check() call just after setting
> evtchn = 0 was done to account for PV domains, while the second
> (hidden) pirq_cleanup_check() call in unmap_domain_pirq_emuirq() would
> do the cleanup for HVM domains.
> 
> Maybe there's something that I'm missing, I have to admit the PIRQ
> logic is awfully complicated, even more when we mix the HVM PIRQ
> stuff.

If you look at pirq_cleanup_check() you'll notice that it takes care
of one HVM case as well (the not emuirq one, i.e. particularly PVH,
but note also how physdev_hvm_map_pirq() calls map_domain_emuirq_pirq()
only conditionally). Plus the crucial aspect of the 2nd paragraph of
the description is that past calling pirq_cleanup_check() it is not
really valid anymore to (blindly) de-reference the struct pirq pointer
we hold in hands. The is_hvm_domain() qualification wasn't enough,
since - as said - it's only one of the possibilities that would allow
the pirq to remain legal to use past the call, when having taken the
function's

        if ( pirq->arch.hvm.emuirq != IRQ_UNBOUND )
            return;

path. A 2nd would be taking the

        if ( !pt_pirq_cleanup_check(&pirq->arch.hvm.dpci) )
            return;

path (i.e. a still in use pass-through IRQ), but the 3rd would still
end in the struct pirq being purged even for HVM.

>> And then after
>> removal from the radix tree the structure wasn't scheduled for freeing.
>> (All structures still left in the radix tree would be freed upon domain
>> destruction, though.)
> 
> So if my understanding is correct, we didn't have a leak due to the
> missing free_pirq_struct() because the inverted check in
> pirq_cleanup_check() macro prevented the removal from the radix tree,
> and so stale entries would be left there and freed at domain
> destruction?

That's the understanding I had come to, yes. What I wasn't entirely
sure about (see the 2nd post-commit-message remark) is why the entry
being left in the radix tree never caused any problems. Presumably
that's a result of pirq_get_info() first checking whether an entry is
already there, allocating a new one only for previously empty slots.

>> --- a/xen/common/event_channel.c
>> +++ b/xen/common/event_channel.c
>> @@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
>>              if ( !is_hvm_domain(d1) )
>>                  pirq_guest_unbind(d1, pirq);
>>              pirq->evtchn = 0;
>> -            pirq_cleanup_check(pirq, d1);
>> -            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
>> -                unmap_domain_pirq_emuirq(d1, pirq->pirq);
>> +            if ( !is_hvm_domain(d1) ||
>> +                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
>> +                 unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 )
> 
> pirq_cleanup_check() already calls pirq_cleanup_check() itself.  Could
> you please add a comment to note that unmap_domain_pirq_emuirq()
> succeeding implies the call to pirq_cleanup_check() has already been
> done?
> 
> Otherwise the logic here looks unbalanced by skipping the
> pirq_cleanup_check() when unmap_domain_pirq_emuirq() succeeds.

Sure, added:

                /*
                 * The successful path of unmap_domain_pirq_emuirq() will have
                 * called pirq_cleanup_check() already.
                 */

>> --- a/xen/include/xen/irq.h
>> +++ b/xen/include/xen/irq.h
>> @@ -158,7 +158,7 @@ extern struct pirq *pirq_get_info(struct
>>  void pirq_cleanup_check(struct pirq *, struct domain *);
>>  
>>  #define pirq_cleanup_check(pirq, d) \
>> -    ((pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
>> +    (!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
> 
> Not that you need to fix it here, but why not place this check in
> pirq_cleanup_check() itself?

See the first of the post-commit-message remarks: The goal was to not
require every arch to replicate that check. At the time it wasn't
clear (to me at least) that the entire concept of pIRQ would likely
remain an x86 special thing anyway.

Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2023-07-27  7:38 ` [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds Jan Beulich
@ 2024-07-01  9:55   ` Roger Pau Monné
  2024-07-01 10:40     ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-01  9:55 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, George Dunlap,
	Julien Grall, Stefano Stabellini, Wei Liu

On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
> PHYSDEVOP_pirq_eoi_gmfn_v<N> accepting just a single GFN implies that no
> more than 32k pIRQ-s can be used by a domain on x86. Document this upper
> bound.
> 
> To also enforce the limit, (ab)use both arch_hwdom_irqs() (changing its
> parameter type) and setup_system_domains(). This is primarily to avoid
> exposing the two static variables or introducing yet further arch hooks.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Instead of passing dom_xen into arch_hwdom_irqs(), NULL could also be
> used. That would make the connection to setup_system_domains() yet more
> weak, though.
> 
> Passing the domain pointer instead of the domain ID would also allow
> to return a possibly different value if sensible for PVH Dom0 (which
> presently has no access to PHYSDEVOP_pirq_eoi_gmfn_v<N> in the first
> place).
> ---
> v2: Also enforce these bounds. Adjust doc to constrain the bound to x86
>     only. Re-base over new earlier patch.
> 
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -1146,7 +1146,8 @@ common for all domUs, while the optional
>  is for dom0.  Changing the setting for domU has no impact on dom0 and vice
>  versa.  For example to change dom0 without changing domU, use
>  `extra_guest_irqs=,512`.  The default value for Dom0 and an eventual separate
> -hardware domain is architecture dependent.
> +hardware domain is architecture dependent.  The upper limit for both values on
> +x86 is such that the resulting total number of IRQs can't be higher than 32768.
>  Note that specifying zero as domU value means zero, while for dom0 it means
>  to use the default.
>  
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
>             nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
>  }
>  
> -unsigned int arch_hwdom_irqs(domid_t domid)
> +unsigned int arch_hwdom_irqs(const struct domain *d)

While at it, should this be __hwdom_init?

I'm fine with changing the function to take a domain parameter...

>  {
>      unsigned int n = fls(num_present_cpus());
>  
> -    if ( !domid )
> +    if ( is_system_domain(d) )
> +        return PAGE_SIZE * BITS_PER_BYTE;

... but why do we need a function call just to get a constant value?
Wouldn't this better be a define in a header?

> +
> +    if ( !d->domain_id )
>          n = min(n, dom0_max_vcpus());
>      n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
>  
>      /* Bounded by the domain pirq eoi bitmap gfn. */
>      n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);

So that could also use the same constant here?

> -    printk("Dom%d has maximum %u PIRQs\n", domid, n);
> +    printk("%pd has maximum %u PIRQs\n", d, n);
>  
>      return n;
>  }
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
>          else
>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
> -                                           : arch_hwdom_irqs(domid);
> +                                           : arch_hwdom_irqs(d);
>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
>  
>          radix_tree_init(&d->pirq_tree);
> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
>      if ( IS_ERR(dom_xen) )
>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
>  
> +#ifdef CONFIG_HAS_PIRQ
> +    /* Bound-check values passed via "extra_guest_irqs=". */
> +    {
> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
> +
> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
> +        {
> +            extra_hwdom_irqs = n - nr_static_irqs;
> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
> +        }
> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
> +        {
> +            extra_domU_irqs = n - nr_static_irqs;
> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
> +        }
> +    }
> +#endif

IMO this is kind of a weird placement. Wouldn't this be more naturally
handled in parse_extra_guest_irqs()?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01  9:55   ` Roger Pau Monné
@ 2024-07-01 10:40     ` Jan Beulich
  2024-07-01 13:29       ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-01 10:40 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On 01.07.2024 11:55, Roger Pau Monné wrote:
> On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/io_apic.c
>> +++ b/xen/arch/x86/io_apic.c
>> @@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
>>             nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
>>  }
>>  
>> -unsigned int arch_hwdom_irqs(domid_t domid)
>> +unsigned int arch_hwdom_irqs(const struct domain *d)
> 
> While at it, should this be __hwdom_init?

It indeed can be, so I've done this for v4.

> I'm fine with changing the function to take a domain parameter...
> 
>>  {
>>      unsigned int n = fls(num_present_cpus());
>>  
>> -    if ( !domid )
>> +    if ( is_system_domain(d) )
>> +        return PAGE_SIZE * BITS_PER_BYTE;
> 
> ... but why do we need a function call just to get a constant value?
> Wouldn't this better be a define in a header?

Would be an option, but would result in parts of the logic living is
distinct places.

>> +
>> +    if ( !d->domain_id )
>>          n = min(n, dom0_max_vcpus());
>>      n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
>>  
>>      /* Bounded by the domain pirq eoi bitmap gfn. */
>>      n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
> 
> So that could also use the same constant here?
> 
>> -    printk("Dom%d has maximum %u PIRQs\n", domid, n);
>> +    printk("%pd has maximum %u PIRQs\n", d, n);
>>  
>>      return n;
>>  }
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
>>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
>>          else
>>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
>> -                                           : arch_hwdom_irqs(domid);
>> +                                           : arch_hwdom_irqs(d);
>>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
>>  
>>          radix_tree_init(&d->pirq_tree);
>> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
>>      if ( IS_ERR(dom_xen) )
>>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
>>  
>> +#ifdef CONFIG_HAS_PIRQ
>> +    /* Bound-check values passed via "extra_guest_irqs=". */
>> +    {
>> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
>> +
>> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
>> +        {
>> +            extra_hwdom_irqs = n - nr_static_irqs;
>> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
>> +        }
>> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
>> +        {
>> +            extra_domU_irqs = n - nr_static_irqs;
>> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
>> +        }
>> +    }
>> +#endif
> 
> IMO this is kind of a weird placement. Wouldn't this be more naturally
> handled in parse_extra_guest_irqs()?

Indeed it is and yes it would, but no, it can't. We shouldn't rely on
the particular behavior of arch_hwdom_irqs(), and in the general case
we can't call it as early as when command line arguments are parsed. I
couldn't think of a neater way of doing this, and it not being pretty
is why I'm saying "(ab)use" in the description.

Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/3] pirq_cleanup_check() leaks
  2024-07-01  9:47     ` Jan Beulich
@ 2024-07-01 11:13       ` Roger Pau Monné
  2024-07-01 13:21         ` [PATCH v3 for-4.19 " Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-01 11:13 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, George Dunlap,
	Julien Grall, Stefano Stabellini, Wei Liu

On Mon, Jul 01, 2024 at 11:47:34AM +0200, Jan Beulich wrote:
> On 01.07.2024 10:55, Roger Pau Monné wrote:
> > On Thu, Jul 27, 2023 at 09:38:29AM +0200, Jan Beulich wrote:
> >> Its original introduction had two issues: For one the "common" part of
> >> the checks (carried out in the macro) was inverted.
> > 
> > Is the current logic in evtchn_close() really malfunctioning?
> 
> First: I'm getting the impression that this entire comment doesn't relate
> to the part of the description above, but to the 2nd paragraph further
> down. Otherwise I'm afraid I may not properly understand your question,
> and hence my response below may not make any sense at all.
> 
> > pirq->evtchn = 0;
> > pirq_cleanup_check(pirq, d1); <- cleanup for PV domains
> > if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
> >     unmap_domain_pirq_emuirq(d1, pirq->pirq); <- cleanup for HVM domains
> > 
> > It would seem to me the pirq_cleanup_check() call just after setting
> > evtchn = 0 was done to account for PV domains, while the second
> > (hidden) pirq_cleanup_check() call in unmap_domain_pirq_emuirq() would
> > do the cleanup for HVM domains.
> > 
> > Maybe there's something that I'm missing, I have to admit the PIRQ
> > logic is awfully complicated, even more when we mix the HVM PIRQ
> > stuff.
> 
> If you look at pirq_cleanup_check() you'll notice that it takes care
> of one HVM case as well (the not emuirq one, i.e. particularly PVH,
> but note also how physdev_hvm_map_pirq() calls map_domain_emuirq_pirq()
> only conditionally). Plus the crucial aspect of the 2nd paragraph of
> the description is that past calling pirq_cleanup_check() it is not
> really valid anymore to (blindly) de-reference the struct pirq pointer
> we hold in hands. The is_hvm_domain() qualification wasn't enough,
> since - as said - it's only one of the possibilities that would allow
> the pirq to remain legal to use past the call, when having taken the
> function's
> 
>         if ( pirq->arch.hvm.emuirq != IRQ_UNBOUND )
>             return;
> 
> path. A 2nd would be taking the
> 
>         if ( !pt_pirq_cleanup_check(&pirq->arch.hvm.dpci) )
>             return;
> 
> path (i.e. a still in use pass-through IRQ), but the 3rd would still
> end in the struct pirq being purged even for HVM.

Right, I was missing that if pirq is properly freed then further
usages of it after the pirq_cleanup_check() would be use after free.

> >> And then after
> >> removal from the radix tree the structure wasn't scheduled for freeing.
> >> (All structures still left in the radix tree would be freed upon domain
> >> destruction, though.)
> > 
> > So if my understanding is correct, we didn't have a leak due to the
> > missing free_pirq_struct() because the inverted check in
> > pirq_cleanup_check() macro prevented the removal from the radix tree,
> > and so stale entries would be left there and freed at domain
> > destruction?
> 
> That's the understanding I had come to, yes. What I wasn't entirely
> sure about (see the 2nd post-commit-message remark) is why the entry
> being left in the radix tree never caused any problems. Presumably
> that's a result of pirq_get_info() first checking whether an entry is
> already there, allocating a new one only for previously empty slots.

Yes, I came to the same conclusion, that not freeing wasn't an issue
as Xen would re-use the old entry.  Hopefully it's clean enough to not
cause issues when re-using.

> >> --- a/xen/common/event_channel.c
> >> +++ b/xen/common/event_channel.c
> >> @@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
> >>              if ( !is_hvm_domain(d1) )
> >>                  pirq_guest_unbind(d1, pirq);
> >>              pirq->evtchn = 0;
> >> -            pirq_cleanup_check(pirq, d1);
> >> -            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
> >> -                unmap_domain_pirq_emuirq(d1, pirq->pirq);
> >> +            if ( !is_hvm_domain(d1) ||
> >> +                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
> >> +                 unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 )
> > 
> > pirq_cleanup_check() already calls pirq_cleanup_check() itself.  Could
> > you please add a comment to note that unmap_domain_pirq_emuirq()
> > succeeding implies the call to pirq_cleanup_check() has already been
> > done?
> > 
> > Otherwise the logic here looks unbalanced by skipping the
> > pirq_cleanup_check() when unmap_domain_pirq_emuirq() succeeds.
> 
> Sure, added:
> 
>                 /*
>                  * The successful path of unmap_domain_pirq_emuirq() will have
>                  * called pirq_cleanup_check() already.
>                  */

With that added:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

> >> --- a/xen/include/xen/irq.h
> >> +++ b/xen/include/xen/irq.h
> >> @@ -158,7 +158,7 @@ extern struct pirq *pirq_get_info(struct
> >>  void pirq_cleanup_check(struct pirq *, struct domain *);
> >>  
> >>  #define pirq_cleanup_check(pirq, d) \
> >> -    ((pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
> >> +    (!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
> > 
> > Not that you need to fix it here, but why not place this check in
> > pirq_cleanup_check() itself?
> 
> See the first of the post-commit-message remarks: The goal was to not
> require every arch to replicate that check. At the time it wasn't
> clear (to me at least) that the entire concept of pIRQ would likely
> remain an x86 special thing anyway.

Anyway, such change would better be done in a separate commit anyway.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 for-4.19 2/3] pirq_cleanup_check() leaks
  2024-07-01 11:13       ` Roger Pau Monné
@ 2024-07-01 13:21         ` Jan Beulich
  2024-07-01 15:05           ` Oleksii
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-01 13:21 UTC (permalink / raw)
  To: Roger Pau Monné, Oleksii Kurochko
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On 01.07.2024 13:13, Roger Pau Monné wrote:
> On Mon, Jul 01, 2024 at 11:47:34AM +0200, Jan Beulich wrote:
>> On 01.07.2024 10:55, Roger Pau Monné wrote:
>>> On Thu, Jul 27, 2023 at 09:38:29AM +0200, Jan Beulich wrote:
>>>> Its original introduction had two issues: For one the "common" part of
>>>> the checks (carried out in the macro) was inverted.
>>>
>>> Is the current logic in evtchn_close() really malfunctioning?
>>
>> First: I'm getting the impression that this entire comment doesn't relate
>> to the part of the description above, but to the 2nd paragraph further
>> down. Otherwise I'm afraid I may not properly understand your question,
>> and hence my response below may not make any sense at all.
>>
>>> pirq->evtchn = 0;
>>> pirq_cleanup_check(pirq, d1); <- cleanup for PV domains
>>> if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
>>>     unmap_domain_pirq_emuirq(d1, pirq->pirq); <- cleanup for HVM domains
>>>
>>> It would seem to me the pirq_cleanup_check() call just after setting
>>> evtchn = 0 was done to account for PV domains, while the second
>>> (hidden) pirq_cleanup_check() call in unmap_domain_pirq_emuirq() would
>>> do the cleanup for HVM domains.
>>>
>>> Maybe there's something that I'm missing, I have to admit the PIRQ
>>> logic is awfully complicated, even more when we mix the HVM PIRQ
>>> stuff.
>>
>> If you look at pirq_cleanup_check() you'll notice that it takes care
>> of one HVM case as well (the not emuirq one, i.e. particularly PVH,
>> but note also how physdev_hvm_map_pirq() calls map_domain_emuirq_pirq()
>> only conditionally). Plus the crucial aspect of the 2nd paragraph of
>> the description is that past calling pirq_cleanup_check() it is not
>> really valid anymore to (blindly) de-reference the struct pirq pointer
>> we hold in hands. The is_hvm_domain() qualification wasn't enough,
>> since - as said - it's only one of the possibilities that would allow
>> the pirq to remain legal to use past the call, when having taken the
>> function's
>>
>>         if ( pirq->arch.hvm.emuirq != IRQ_UNBOUND )
>>             return;
>>
>> path. A 2nd would be taking the
>>
>>         if ( !pt_pirq_cleanup_check(&pirq->arch.hvm.dpci) )
>>             return;
>>
>> path (i.e. a still in use pass-through IRQ), but the 3rd would still
>> end in the struct pirq being purged even for HVM.
> 
> Right, I was missing that if pirq is properly freed then further
> usages of it after the pirq_cleanup_check() would be use after free.
> 
>>>> And then after
>>>> removal from the radix tree the structure wasn't scheduled for freeing.
>>>> (All structures still left in the radix tree would be freed upon domain
>>>> destruction, though.)
>>>
>>> So if my understanding is correct, we didn't have a leak due to the
>>> missing free_pirq_struct() because the inverted check in
>>> pirq_cleanup_check() macro prevented the removal from the radix tree,
>>> and so stale entries would be left there and freed at domain
>>> destruction?
>>
>> That's the understanding I had come to, yes. What I wasn't entirely
>> sure about (see the 2nd post-commit-message remark) is why the entry
>> being left in the radix tree never caused any problems. Presumably
>> that's a result of pirq_get_info() first checking whether an entry is
>> already there, allocating a new one only for previously empty slots.
> 
> Yes, I came to the same conclusion, that not freeing wasn't an issue
> as Xen would re-use the old entry.  Hopefully it's clean enough to not
> cause issues when re-using.
> 
>>>> --- a/xen/common/event_channel.c
>>>> +++ b/xen/common/event_channel.c
>>>> @@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
>>>>              if ( !is_hvm_domain(d1) )
>>>>                  pirq_guest_unbind(d1, pirq);
>>>>              pirq->evtchn = 0;
>>>> -            pirq_cleanup_check(pirq, d1);
>>>> -            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 )
>>>> -                unmap_domain_pirq_emuirq(d1, pirq->pirq);
>>>> +            if ( !is_hvm_domain(d1) ||
>>>> +                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
>>>> +                 unmap_domain_pirq_emuirq(d1, pirq->pirq) < 0 )
>>>
>>> pirq_cleanup_check() already calls pirq_cleanup_check() itself.  Could
>>> you please add a comment to note that unmap_domain_pirq_emuirq()
>>> succeeding implies the call to pirq_cleanup_check() has already been
>>> done?
>>>
>>> Otherwise the logic here looks unbalanced by skipping the
>>> pirq_cleanup_check() when unmap_domain_pirq_emuirq() succeeds.
>>
>> Sure, added:
>>
>>                 /*
>>                  * The successful path of unmap_domain_pirq_emuirq() will have
>>                  * called pirq_cleanup_check() already.
>>                  */
> 
> With that added:
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks Roger.

Oleksii - would you please consider giving this long-standing bug fix a
release ack?

Thanks, Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01 10:40     ` Jan Beulich
@ 2024-07-01 13:29       ` Roger Pau Monné
  2024-07-01 15:07         ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-01 13:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On Mon, Jul 01, 2024 at 12:40:35PM +0200, Jan Beulich wrote:
> On 01.07.2024 11:55, Roger Pau Monné wrote:
> > On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
> >> --- a/xen/arch/x86/io_apic.c
> >> +++ b/xen/arch/x86/io_apic.c
> >> @@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
> >>             nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
> >>  }
> >>  
> >> -unsigned int arch_hwdom_irqs(domid_t domid)
> >> +unsigned int arch_hwdom_irqs(const struct domain *d)
> > 
> > While at it, should this be __hwdom_init?
> 
> It indeed can be, so I've done this for v4.
> 
> > I'm fine with changing the function to take a domain parameter...
> > 
> >>  {
> >>      unsigned int n = fls(num_present_cpus());
> >>  
> >> -    if ( !domid )
> >> +    if ( is_system_domain(d) )
> >> +        return PAGE_SIZE * BITS_PER_BYTE;
> > 
> > ... but why do we need a function call just to get a constant value?
> > Wouldn't this better be a define in a header?
> 
> Would be an option, but would result in parts of the logic living is
> distinct places.
> 
> >> +
> >> +    if ( !d->domain_id )
> >>          n = min(n, dom0_max_vcpus());
> >>      n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
> >>  
> >>      /* Bounded by the domain pirq eoi bitmap gfn. */
> >>      n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
> > 
> > So that could also use the same constant here?

I would have a slight preference for PAGE_SIZE * BITS_PER_BYTE being
defined inside of this function as:

/* Bounded by the domain pirq eoi bitmap gfn. */
const unsigned int max_irqs = PAGE_SIZE * BITS_PER_BYTE;

Or similar for clarity purposes.

While at it, I've noticed that PHYSDEVOP_pirq_eoi_gmfn_v{1,2} is not
available to HVM guests (not even when exposing PIRQ support) and
hence I wonder if we should special case PVH dom0, but maybe it's best
to deal with this properly rather than hacking something special
just for PVH dom0.  At the end of the day the current limit is high
enough to not cause issues on current systems I would expect.

> >> -    printk("Dom%d has maximum %u PIRQs\n", domid, n);
> >> +    printk("%pd has maximum %u PIRQs\n", d, n);
> >>  
> >>      return n;
> >>  }
> >> --- a/xen/common/domain.c
> >> +++ b/xen/common/domain.c
> >> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
> >>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
> >>          else
> >>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
> >> -                                           : arch_hwdom_irqs(domid);
> >> +                                           : arch_hwdom_irqs(d);
> >>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
> >>  
> >>          radix_tree_init(&d->pirq_tree);
> >> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
> >>      if ( IS_ERR(dom_xen) )
> >>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
> >>  
> >> +#ifdef CONFIG_HAS_PIRQ
> >> +    /* Bound-check values passed via "extra_guest_irqs=". */
> >> +    {
> >> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
> >> +
> >> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
> >> +        {
> >> +            extra_hwdom_irqs = n - nr_static_irqs;
> >> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
> >> +        }
> >> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
> >> +        {
> >> +            extra_domU_irqs = n - nr_static_irqs;
> >> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
> >> +        }
> >> +    }
> >> +#endif
> > 
> > IMO this is kind of a weird placement. Wouldn't this be more naturally
> > handled in parse_extra_guest_irqs()?
> 
> Indeed it is and yes it would, but no, it can't. We shouldn't rely on
> the particular behavior of arch_hwdom_irqs(), and in the general case
> we can't call it as early as when command line arguments are parsed. I
> couldn't think of a neater way of doing this, and it not being pretty
> is why I'm saying "(ab)use" in the description.

I see, nr_static_irqs is an alias of nr_irqs_gsi, which is not properly
set by the time we evaluate command line arguments.

My only possible suggestion would be to do it as a presmp initcall,
and define/register such initcall for x86 only, the only benefit would
be that such inicall could be defined in the same translation unit as
arch_hwdom_irqs() then.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 for-4.19 2/3] pirq_cleanup_check() leaks
  2024-07-01 13:21         ` [PATCH v3 for-4.19 " Jan Beulich
@ 2024-07-01 15:05           ` Oleksii
  0 siblings, 0 replies; 20+ messages in thread
From: Oleksii @ 2024-07-01 15:05 UTC (permalink / raw)
  To: Jan Beulich, Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On Mon, 2024-07-01 at 15:21 +0200, Jan Beulich wrote:
> On 01.07.2024 13:13, Roger Pau Monné wrote:
> > On Mon, Jul 01, 2024 at 11:47:34AM +0200, Jan Beulich wrote:
> > > On 01.07.2024 10:55, Roger Pau Monné wrote:
> > > > On Thu, Jul 27, 2023 at 09:38:29AM +0200, Jan Beulich wrote:
> > > > > Its original introduction had two issues: For one the
> > > > > "common" part of
> > > > > the checks (carried out in the macro) was inverted.
> > > > 
> > > > Is the current logic in evtchn_close() really malfunctioning?
> > > 
> > > First: I'm getting the impression that this entire comment
> > > doesn't relate
> > > to the part of the description above, but to the 2nd paragraph
> > > further
> > > down. Otherwise I'm afraid I may not properly understand your
> > > question,
> > > and hence my response below may not make any sense at all.
> > > 
> > > > pirq->evtchn = 0;
> > > > pirq_cleanup_check(pirq, d1); <- cleanup for PV domains
> > > > if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) >
> > > > 0 )
> > > >     unmap_domain_pirq_emuirq(d1, pirq->pirq); <- cleanup for
> > > > HVM domains
> > > > 
> > > > It would seem to me the pirq_cleanup_check() call just after
> > > > setting
> > > > evtchn = 0 was done to account for PV domains, while the second
> > > > (hidden) pirq_cleanup_check() call in
> > > > unmap_domain_pirq_emuirq() would
> > > > do the cleanup for HVM domains.
> > > > 
> > > > Maybe there's something that I'm missing, I have to admit the
> > > > PIRQ
> > > > logic is awfully complicated, even more when we mix the HVM
> > > > PIRQ
> > > > stuff.
> > > 
> > > If you look at pirq_cleanup_check() you'll notice that it takes
> > > care
> > > of one HVM case as well (the not emuirq one, i.e. particularly
> > > PVH,
> > > but note also how physdev_hvm_map_pirq() calls
> > > map_domain_emuirq_pirq()
> > > only conditionally). Plus the crucial aspect of the 2nd paragraph
> > > of
> > > the description is that past calling pirq_cleanup_check() it is
> > > not
> > > really valid anymore to (blindly) de-reference the struct pirq
> > > pointer
> > > we hold in hands. The is_hvm_domain() qualification wasn't
> > > enough,
> > > since - as said - it's only one of the possibilities that would
> > > allow
> > > the pirq to remain legal to use past the call, when having taken
> > > the
> > > function's
> > > 
> > >         if ( pirq->arch.hvm.emuirq != IRQ_UNBOUND )
> > >             return;
> > > 
> > > path. A 2nd would be taking the
> > > 
> > >         if ( !pt_pirq_cleanup_check(&pirq->arch.hvm.dpci) )
> > >             return;
> > > 
> > > path (i.e. a still in use pass-through IRQ), but the 3rd would
> > > still
> > > end in the struct pirq being purged even for HVM.
> > 
> > Right, I was missing that if pirq is properly freed then further
> > usages of it after the pirq_cleanup_check() would be use after
> > free.
> > 
> > > > > And then after
> > > > > removal from the radix tree the structure wasn't scheduled
> > > > > for freeing.
> > > > > (All structures still left in the radix tree would be freed
> > > > > upon domain
> > > > > destruction, though.)
> > > > 
> > > > So if my understanding is correct, we didn't have a leak due to
> > > > the
> > > > missing free_pirq_struct() because the inverted check in
> > > > pirq_cleanup_check() macro prevented the removal from the radix
> > > > tree,
> > > > and so stale entries would be left there and freed at domain
> > > > destruction?
> > > 
> > > That's the understanding I had come to, yes. What I wasn't
> > > entirely
> > > sure about (see the 2nd post-commit-message remark) is why the
> > > entry
> > > being left in the radix tree never caused any problems.
> > > Presumably
> > > that's a result of pirq_get_info() first checking whether an
> > > entry is
> > > already there, allocating a new one only for previously empty
> > > slots.
> > 
> > Yes, I came to the same conclusion, that not freeing wasn't an
> > issue
> > as Xen would re-use the old entry.  Hopefully it's clean enough to
> > not
> > cause issues when re-using.
> > 
> > > > > --- a/xen/common/event_channel.c
> > > > > +++ b/xen/common/event_channel.c
> > > > > @@ -711,9 +711,10 @@ int evtchn_close(struct domain *d1, int
> > > > >              if ( !is_hvm_domain(d1) )
> > > > >                  pirq_guest_unbind(d1, pirq);
> > > > >              pirq->evtchn = 0;
> > > > > -            pirq_cleanup_check(pirq, d1);
> > > > > -            if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1,
> > > > > pirq->pirq) > 0 )
> > > > > -                unmap_domain_pirq_emuirq(d1, pirq->pirq);
> > > > > +            if ( !is_hvm_domain(d1) ||
> > > > > +                 domain_pirq_to_irq(d1, pirq->pirq) <= 0 ||
> > > > > +                 unmap_domain_pirq_emuirq(d1, pirq->pirq) <
> > > > > 0 )
> > > > 
> > > > pirq_cleanup_check() already calls pirq_cleanup_check()
> > > > itself.  Could
> > > > you please add a comment to note that
> > > > unmap_domain_pirq_emuirq()
> > > > succeeding implies the call to pirq_cleanup_check() has already
> > > > been
> > > > done?
> > > > 
> > > > Otherwise the logic here looks unbalanced by skipping the
> > > > pirq_cleanup_check() when unmap_domain_pirq_emuirq() succeeds.
> > > 
> > > Sure, added:
> > > 
> > >                 /*
> > >                  * The successful path of
> > > unmap_domain_pirq_emuirq() will have
> > >                  * called pirq_cleanup_check() already.
> > >                  */
> > 
> > With that added:
> > 
> > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Thanks Roger.
> 
> Oleksii - would you please consider giving this long-standing bug fix
> a
> release ack?
Release-Acked-By: Oleksii Kurochko <oleksii.kurochko@gmail.com>

~ Oleksii


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01 13:29       ` Roger Pau Monné
@ 2024-07-01 15:07         ` Jan Beulich
  2024-07-01 15:17           ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-01 15:07 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On 01.07.2024 15:29, Roger Pau Monné wrote:
> On Mon, Jul 01, 2024 at 12:40:35PM +0200, Jan Beulich wrote:
>> On 01.07.2024 11:55, Roger Pau Monné wrote:
>>> On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/io_apic.c
>>>> +++ b/xen/arch/x86/io_apic.c
>>>> @@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
>>>>             nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
>>>>  }
>>>>  
>>>> -unsigned int arch_hwdom_irqs(domid_t domid)
>>>> +unsigned int arch_hwdom_irqs(const struct domain *d)
>>>
>>> While at it, should this be __hwdom_init?
>>
>> It indeed can be, so I've done this for v4.
>>
>>> I'm fine with changing the function to take a domain parameter...
>>>
>>>>  {
>>>>      unsigned int n = fls(num_present_cpus());
>>>>  
>>>> -    if ( !domid )
>>>> +    if ( is_system_domain(d) )
>>>> +        return PAGE_SIZE * BITS_PER_BYTE;
>>>
>>> ... but why do we need a function call just to get a constant value?
>>> Wouldn't this better be a define in a header?
>>
>> Would be an option, but would result in parts of the logic living is
>> distinct places.
>>
>>>> +
>>>> +    if ( !d->domain_id )
>>>>          n = min(n, dom0_max_vcpus());
>>>>      n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
>>>>  
>>>>      /* Bounded by the domain pirq eoi bitmap gfn. */
>>>>      n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
>>>
>>> So that could also use the same constant here?
> 
> I would have a slight preference for PAGE_SIZE * BITS_PER_BYTE being
> defined inside of this function as:
> 
> /* Bounded by the domain pirq eoi bitmap gfn. */
> const unsigned int max_irqs = PAGE_SIZE * BITS_PER_BYTE;
> 
> Or similar for clarity purposes.

Can do, sure.

> While at it, I've noticed that PHYSDEVOP_pirq_eoi_gmfn_v{1,2} is not
> available to HVM guests (not even when exposing PIRQ support) and
> hence I wonder if we should special case PVH dom0, but maybe it's best
> to deal with this properly rather than hacking something special
> just for PVH dom0.  At the end of the day the current limit is high
> enough to not cause issues on current systems I would expect.

Oh, so entirely the other way around than mentioned when we talked, once
again due to the filtering in hvm/hypercall.h that I keep forgetting. So
in principle we could avoid the bounding for HVM. Just that right now
extra_domU_irqs covers both PV and HVM, and would hence need splitting
first.

>>>> --- a/xen/common/domain.c
>>>> +++ b/xen/common/domain.c
>>>> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
>>>>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
>>>>          else
>>>>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
>>>> -                                           : arch_hwdom_irqs(domid);
>>>> +                                           : arch_hwdom_irqs(d);
>>>>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
>>>>  
>>>>          radix_tree_init(&d->pirq_tree);
>>>> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
>>>>      if ( IS_ERR(dom_xen) )
>>>>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
>>>>  
>>>> +#ifdef CONFIG_HAS_PIRQ
>>>> +    /* Bound-check values passed via "extra_guest_irqs=". */
>>>> +    {
>>>> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
>>>> +
>>>> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
>>>> +        {
>>>> +            extra_hwdom_irqs = n - nr_static_irqs;
>>>> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
>>>> +        }
>>>> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
>>>> +        {
>>>> +            extra_domU_irqs = n - nr_static_irqs;
>>>> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
>>>> +        }
>>>> +    }
>>>> +#endif
>>>
>>> IMO this is kind of a weird placement. Wouldn't this be more naturally
>>> handled in parse_extra_guest_irqs()?
>>
>> Indeed it is and yes it would, but no, it can't. We shouldn't rely on
>> the particular behavior of arch_hwdom_irqs(), and in the general case
>> we can't call it as early as when command line arguments are parsed. I
>> couldn't think of a neater way of doing this, and it not being pretty
>> is why I'm saying "(ab)use" in the description.
> 
> I see, nr_static_irqs is an alias of nr_irqs_gsi, which is not properly
> set by the time we evaluate command line arguments.
> 
> My only possible suggestion would be to do it as a presmp initcall,
> and define/register such initcall for x86 only, the only benefit would
> be that such inicall could be defined in the same translation unit as
> arch_hwdom_irqs() then.

Which then would require making extra_{hwdom,domU}_irqs available to
x86/io_apic.c, which also wouldn't be very nice. To be honest, I'd prefer
to keep the logic where it is, until such time where perhaps we move pIRQ
stuff wholesale to x86-only files.

Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01 15:07         ` Jan Beulich
@ 2024-07-01 15:17           ` Roger Pau Monné
  2024-07-02  8:26             ` [PATCH v3 for-4.19? " Jan Beulich
  2024-07-02  8:30             ` [PATCH v3 " Jan Beulich
  0 siblings, 2 replies; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-01 15:17 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On Mon, Jul 01, 2024 at 05:07:19PM +0200, Jan Beulich wrote:
> On 01.07.2024 15:29, Roger Pau Monné wrote:
> > On Mon, Jul 01, 2024 at 12:40:35PM +0200, Jan Beulich wrote:
> >> On 01.07.2024 11:55, Roger Pau Monné wrote:
> >>> On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
> >>>> --- a/xen/arch/x86/io_apic.c
> >>>> +++ b/xen/arch/x86/io_apic.c
> >>>> @@ -2663,18 +2663,21 @@ void __init ioapic_init(void)
> >>>>             nr_irqs_gsi, nr_irqs - nr_irqs_gsi);
> >>>>  }
> >>>>  
> >>>> -unsigned int arch_hwdom_irqs(domid_t domid)
> >>>> +unsigned int arch_hwdom_irqs(const struct domain *d)
> >>>
> >>> While at it, should this be __hwdom_init?
> >>
> >> It indeed can be, so I've done this for v4.
> >>
> >>> I'm fine with changing the function to take a domain parameter...
> >>>
> >>>>  {
> >>>>      unsigned int n = fls(num_present_cpus());
> >>>>  
> >>>> -    if ( !domid )
> >>>> +    if ( is_system_domain(d) )
> >>>> +        return PAGE_SIZE * BITS_PER_BYTE;
> >>>
> >>> ... but why do we need a function call just to get a constant value?
> >>> Wouldn't this better be a define in a header?
> >>
> >> Would be an option, but would result in parts of the logic living is
> >> distinct places.
> >>
> >>>> +
> >>>> +    if ( !d->domain_id )
> >>>>          n = min(n, dom0_max_vcpus());
> >>>>      n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
> >>>>  
> >>>>      /* Bounded by the domain pirq eoi bitmap gfn. */
> >>>>      n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
> >>>
> >>> So that could also use the same constant here?
> > 
> > I would have a slight preference for PAGE_SIZE * BITS_PER_BYTE being
> > defined inside of this function as:
> > 
> > /* Bounded by the domain pirq eoi bitmap gfn. */
> > const unsigned int max_irqs = PAGE_SIZE * BITS_PER_BYTE;
> > 
> > Or similar for clarity purposes.
> 
> Can do, sure.
> 
> > While at it, I've noticed that PHYSDEVOP_pirq_eoi_gmfn_v{1,2} is not
> > available to HVM guests (not even when exposing PIRQ support) and
> > hence I wonder if we should special case PVH dom0, but maybe it's best
> > to deal with this properly rather than hacking something special
> > just for PVH dom0.  At the end of the day the current limit is high
> > enough to not cause issues on current systems I would expect.
> 
> Oh, so entirely the other way around than mentioned when we talked, once
> again due to the filtering in hvm/hypercall.h that I keep forgetting. So
> in principle we could avoid the bounding for HVM. Just that right now
> extra_domU_irqs covers both PV and HVM, and would hence need splitting
> first.

Yes, we would need to split, that's why I'm OK with what you propose
here.  We can do the split as a later change.

> >>>> --- a/xen/common/domain.c
> >>>> +++ b/xen/common/domain.c
> >>>> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
> >>>>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
> >>>>          else
> >>>>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
> >>>> -                                           : arch_hwdom_irqs(domid);
> >>>> +                                           : arch_hwdom_irqs(d);
> >>>>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
> >>>>  
> >>>>          radix_tree_init(&d->pirq_tree);
> >>>> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
> >>>>      if ( IS_ERR(dom_xen) )
> >>>>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
> >>>>  
> >>>> +#ifdef CONFIG_HAS_PIRQ
> >>>> +    /* Bound-check values passed via "extra_guest_irqs=". */
> >>>> +    {
> >>>> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
> >>>> +
> >>>> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
> >>>> +        {
> >>>> +            extra_hwdom_irqs = n - nr_static_irqs;
> >>>> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
> >>>> +        }
> >>>> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
> >>>> +        {
> >>>> +            extra_domU_irqs = n - nr_static_irqs;
> >>>> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
> >>>> +        }
> >>>> +    }
> >>>> +#endif
> >>>
> >>> IMO this is kind of a weird placement. Wouldn't this be more naturally
> >>> handled in parse_extra_guest_irqs()?
> >>
> >> Indeed it is and yes it would, but no, it can't. We shouldn't rely on
> >> the particular behavior of arch_hwdom_irqs(), and in the general case
> >> we can't call it as early as when command line arguments are parsed. I
> >> couldn't think of a neater way of doing this, and it not being pretty
> >> is why I'm saying "(ab)use" in the description.
> > 
> > I see, nr_static_irqs is an alias of nr_irqs_gsi, which is not properly
> > set by the time we evaluate command line arguments.
> > 
> > My only possible suggestion would be to do it as a presmp initcall,
> > and define/register such initcall for x86 only, the only benefit would
> > be that such inicall could be defined in the same translation unit as
> > arch_hwdom_irqs() then.
> 
> Which then would require making extra_{hwdom,domU}_irqs available to
> x86/io_apic.c, which also wouldn't be very nice. To be honest, I'd prefer
> to keep the logic where it is, until such time where perhaps we move pIRQ
> stuff wholesale to x86-only files.

Fine by me.

I think we are in agreement about what needs doing.  I can provide:

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

With the changes we have agreed to arch_hwdom_irqs().

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 for-4.19? 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01 15:17           ` Roger Pau Monné
@ 2024-07-02  8:26             ` Jan Beulich
  2024-07-02  9:50               ` Oleksii
  2024-07-02  8:30             ` [PATCH v3 " Jan Beulich
  1 sibling, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-02  8:26 UTC (permalink / raw)
  To: Oleksii Kurochko
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu, Roger Pau Monné

On 01.07.2024 17:17, Roger Pau Monné wrote:
> I think we are in agreement about what needs doing.  I can provide:
> 
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> With the changes we have agreed to arch_hwdom_irqs().

Yet another request for considering for a release-ack.

Thanks, Jan



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-01 15:17           ` Roger Pau Monné
  2024-07-02  8:26             ` [PATCH v3 for-4.19? " Jan Beulich
@ 2024-07-02  8:30             ` Jan Beulich
  2024-07-02  8:45               ` Roger Pau Monné
  1 sibling, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2024-07-02  8:30 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On 01.07.2024 17:17, Roger Pau Monné wrote:
> On Mon, Jul 01, 2024 at 05:07:19PM +0200, Jan Beulich wrote:
>> On 01.07.2024 15:29, Roger Pau Monné wrote:
>>> On Mon, Jul 01, 2024 at 12:40:35PM +0200, Jan Beulich wrote:
>>>> On 01.07.2024 11:55, Roger Pau Monné wrote:
>>>>> On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
>>>>>> --- a/xen/common/domain.c
>>>>>> +++ b/xen/common/domain.c
>>>>>> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
>>>>>>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
>>>>>>          else
>>>>>>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
>>>>>> -                                           : arch_hwdom_irqs(domid);
>>>>>> +                                           : arch_hwdom_irqs(d);
>>>>>>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
>>>>>>  
>>>>>>          radix_tree_init(&d->pirq_tree);
>>>>>> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
>>>>>>      if ( IS_ERR(dom_xen) )
>>>>>>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
>>>>>>  
>>>>>> +#ifdef CONFIG_HAS_PIRQ
>>>>>> +    /* Bound-check values passed via "extra_guest_irqs=". */
>>>>>> +    {
>>>>>> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
>>>>>> +
>>>>>> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
>>>>>> +        {
>>>>>> +            extra_hwdom_irqs = n - nr_static_irqs;
>>>>>> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
>>>>>> +        }
>>>>>> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
>>>>>> +        {
>>>>>> +            extra_domU_irqs = n - nr_static_irqs;
>>>>>> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
>>>>>> +        }
>>>>>> +    }
>>>>>> +#endif
>>>>>
>>>>> IMO this is kind of a weird placement. Wouldn't this be more naturally
>>>>> handled in parse_extra_guest_irqs()?
>>>>
>>>> Indeed it is and yes it would, but no, it can't. We shouldn't rely on
>>>> the particular behavior of arch_hwdom_irqs(), and in the general case
>>>> we can't call it as early as when command line arguments are parsed. I
>>>> couldn't think of a neater way of doing this, and it not being pretty
>>>> is why I'm saying "(ab)use" in the description.
>>>
>>> I see, nr_static_irqs is an alias of nr_irqs_gsi, which is not properly
>>> set by the time we evaluate command line arguments.
>>>
>>> My only possible suggestion would be to do it as a presmp initcall,
>>> and define/register such initcall for x86 only, the only benefit would
>>> be that such inicall could be defined in the same translation unit as
>>> arch_hwdom_irqs() then.
>>
>> Which then would require making extra_{hwdom,domU}_irqs available to
>> x86/io_apic.c, which also wouldn't be very nice. To be honest, I'd prefer
>> to keep the logic where it is, until such time where perhaps we move pIRQ
>> stuff wholesale to x86-only files.
> 
> Fine by me.
> 
> I think we are in agreement about what needs doing.

Hmm, after further thinking I'm not sure splitting would be the way to go.
We should rather properly remove the concept of pIRQ from PVH, i.e. not
only not have them visible to the kernel, but also not use e.g. ->nr_pirqs
and ->pirq_tree internally. Then we could presumably drop the constraining
via this command line option (documenting it as inapplicable to PVH).

>  I can provide:
> 
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> With the changes we have agreed to arch_hwdom_irqs().

Thanks; changes done locally.

Jan


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-02  8:30             ` [PATCH v3 " Jan Beulich
@ 2024-07-02  8:45               ` Roger Pau Monné
  0 siblings, 0 replies; 20+ messages in thread
From: Roger Pau Monné @ 2024-07-02  8:45 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu

On Tue, Jul 02, 2024 at 10:30:03AM +0200, Jan Beulich wrote:
> On 01.07.2024 17:17, Roger Pau Monné wrote:
> > On Mon, Jul 01, 2024 at 05:07:19PM +0200, Jan Beulich wrote:
> >> On 01.07.2024 15:29, Roger Pau Monné wrote:
> >>> On Mon, Jul 01, 2024 at 12:40:35PM +0200, Jan Beulich wrote:
> >>>> On 01.07.2024 11:55, Roger Pau Monné wrote:
> >>>>> On Thu, Jul 27, 2023 at 09:38:55AM +0200, Jan Beulich wrote:
> >>>>>> --- a/xen/common/domain.c
> >>>>>> +++ b/xen/common/domain.c
> >>>>>> @@ -693,7 +693,7 @@ struct domain *domain_create(domid_t dom
> >>>>>>              d->nr_pirqs = nr_static_irqs + extra_domU_irqs;
> >>>>>>          else
> >>>>>>              d->nr_pirqs = extra_hwdom_irqs ? nr_static_irqs + extra_hwdom_irqs
> >>>>>> -                                           : arch_hwdom_irqs(domid);
> >>>>>> +                                           : arch_hwdom_irqs(d);
> >>>>>>          d->nr_pirqs = min(d->nr_pirqs, nr_irqs);
> >>>>>>  
> >>>>>>          radix_tree_init(&d->pirq_tree);
> >>>>>> @@ -819,6 +819,24 @@ void __init setup_system_domains(void)
> >>>>>>      if ( IS_ERR(dom_xen) )
> >>>>>>          panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
> >>>>>>  
> >>>>>> +#ifdef CONFIG_HAS_PIRQ
> >>>>>> +    /* Bound-check values passed via "extra_guest_irqs=". */
> >>>>>> +    {
> >>>>>> +        unsigned int n = max(arch_hwdom_irqs(dom_xen), nr_static_irqs);
> >>>>>> +
> >>>>>> +        if ( extra_hwdom_irqs > n - nr_static_irqs )
> >>>>>> +        {
> >>>>>> +            extra_hwdom_irqs = n - nr_static_irqs;
> >>>>>> +            printk(XENLOG_WARNING "hwdom IRQs bounded to %u\n", n);
> >>>>>> +        }
> >>>>>> +        if ( extra_domU_irqs > max(32U, n - nr_static_irqs) )
> >>>>>> +        {
> >>>>>> +            extra_domU_irqs = n - nr_static_irqs;
> >>>>>> +            printk(XENLOG_WARNING "domU IRQs bounded to %u\n", n);
> >>>>>> +        }
> >>>>>> +    }
> >>>>>> +#endif
> >>>>>
> >>>>> IMO this is kind of a weird placement. Wouldn't this be more naturally
> >>>>> handled in parse_extra_guest_irqs()?
> >>>>
> >>>> Indeed it is and yes it would, but no, it can't. We shouldn't rely on
> >>>> the particular behavior of arch_hwdom_irqs(), and in the general case
> >>>> we can't call it as early as when command line arguments are parsed. I
> >>>> couldn't think of a neater way of doing this, and it not being pretty
> >>>> is why I'm saying "(ab)use" in the description.
> >>>
> >>> I see, nr_static_irqs is an alias of nr_irqs_gsi, which is not properly
> >>> set by the time we evaluate command line arguments.
> >>>
> >>> My only possible suggestion would be to do it as a presmp initcall,
> >>> and define/register such initcall for x86 only, the only benefit would
> >>> be that such inicall could be defined in the same translation unit as
> >>> arch_hwdom_irqs() then.
> >>
> >> Which then would require making extra_{hwdom,domU}_irqs available to
> >> x86/io_apic.c, which also wouldn't be very nice. To be honest, I'd prefer
> >> to keep the logic where it is, until such time where perhaps we move pIRQ
> >> stuff wholesale to x86-only files.
> > 
> > Fine by me.
> > 
> > I think we are in agreement about what needs doing.
> 
> Hmm, after further thinking I'm not sure splitting would be the way to go.
> We should rather properly remove the concept of pIRQ from PVH, i.e. not
> only not have them visible to the kernel, but also not use e.g. ->nr_pirqs
> and ->pirq_tree internally. Then we could presumably drop the constraining
> via this command line option (documenting it as inapplicable to PVH).

Removing it from PVH would also imply removing from HVM AFAICT (unless
it's HVM with explicitly pIRQ support).  I think the main issue there
would be dealing with the change in all the interfaces that currently
take pIRQ parameters, albeit I could be wrong.  Seems like a
worthwhile improvement.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 for-4.19? 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-02  8:26             ` [PATCH v3 for-4.19? " Jan Beulich
@ 2024-07-02  9:50               ` Oleksii
  2024-07-02  9:55                 ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Oleksii @ 2024-07-02  9:50 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu, Roger Pau Monné

On Tue, 2024-07-02 at 10:26 +0200, Jan Beulich wrote:
> On 01.07.2024 17:17, Roger Pau Monné wrote:
> > I think we are in agreement about what needs doing.  I can provide:
> > 
> > Acked-by: Roger Pau Monné <roger.pau@citrix.com>
> > 
> > With the changes we have agreed to arch_hwdom_irqs().
> 
> Yet another request for considering for a release-ack.
It doesn't fix any real issue, does it? It just provides limitation of
how many pIRQs can be used by domain.

Anyway this change is low-risk so:
 Release-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>

~ Oleksii


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 for-4.19? 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds
  2024-07-02  9:50               ` Oleksii
@ 2024-07-02  9:55                 ` Jan Beulich
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Beulich @ 2024-07-02  9:55 UTC (permalink / raw)
  To: Oleksii
  Cc: xen-devel@lists.xenproject.org, Andrew Cooper, Julien Grall,
	Stefano Stabellini, Wei Liu, Roger Pau Monné

On 02.07.2024 11:50, Oleksii wrote:
> On Tue, 2024-07-02 at 10:26 +0200, Jan Beulich wrote:
>> On 01.07.2024 17:17, Roger Pau Monné wrote:
>>> I think we are in agreement about what needs doing.  I can provide:
>>>
>>> Acked-by: Roger Pau Monné <roger.pau@citrix.com>
>>>
>>> With the changes we have agreed to arch_hwdom_irqs().
>>
>> Yet another request for considering for a release-ack.
> It doesn't fix any real issue, does it?

It does address a bug report we had, by folks specifying excessively large
values with that command line option.

> It just provides limitation of how many pIRQs can be used by domain.
> 
> Anyway this change is low-risk so:
>  Release-Acked-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>

Thanks.

Jan



^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2024-07-02  9:56 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-07-27  7:37 [PATCH v3 0/3] new CONFIG_HAS_PIRQ and extra_guest_irqs adjustment Jan Beulich
2023-07-27  7:38 ` [PATCH v3 1/3] restrict concept of pIRQ to x86 Jan Beulich
2024-03-12 22:54   ` Julien Grall
2023-07-27  7:38 ` [PATCH v3 2/3] pirq_cleanup_check() leaks Jan Beulich
2024-07-01  8:55   ` Roger Pau Monné
2024-07-01  9:47     ` Jan Beulich
2024-07-01 11:13       ` Roger Pau Monné
2024-07-01 13:21         ` [PATCH v3 for-4.19 " Jan Beulich
2024-07-01 15:05           ` Oleksii
2023-07-27  7:38 ` [PATCH v3 3/3] cmdline: document and enforce "extra_guest_irqs" upper bounds Jan Beulich
2024-07-01  9:55   ` Roger Pau Monné
2024-07-01 10:40     ` Jan Beulich
2024-07-01 13:29       ` Roger Pau Monné
2024-07-01 15:07         ` Jan Beulich
2024-07-01 15:17           ` Roger Pau Monné
2024-07-02  8:26             ` [PATCH v3 for-4.19? " Jan Beulich
2024-07-02  9:50               ` Oleksii
2024-07-02  9:55                 ` Jan Beulich
2024-07-02  8:30             ` [PATCH v3 " Jan Beulich
2024-07-02  8:45               ` Roger Pau Monné

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.