All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH, RFC] x86/HVM: batch vCPU wakeups
@ 2014-09-09  8:33 Jan Beulich
  2014-09-09  9:33 ` Pasi Kärkkäinen
  2014-09-09 21:29 ` Tim Deegan
  0 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2014-09-09  8:33 UTC (permalink / raw)
  To: xen-devel; +Cc: Ian Campbell, Keir Fraser, Ian Jackson, Tim Deegan

[-- Attachment #1: Type: text/plain, Size: 6192 bytes --]

Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
especially when many of the remote pCPU-s are in deep C-states. For
64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
accumulated times of over 2ms were observed. Considering that Windows
broadcasts IPIs from its timer interrupt, which at least at certain
times can run at 1kHz, it is clear that this can result in good
guest behavior. In fact, on said hardware guests with significantly
beyond 40 vCPU-s simply hung when e.g. ServerManager gets started.
With the patch here, average broadcast times for a 64-vCPU guest went
to a measured maximum of 310us (which is still quite a lot).

This isn't just helping to reduce the number of ICR writes when the
host APICs run in clustered mode, but also to reduce them by
suppressing the sends altogether when - by the time
cpu_raise_softirq_batch_finish() is reached - the remote CPU already
managed to handle the softirq. Plus - when using MONITOR/MWAIT - the
update of softirq_pending(cpu), being on the monitored cache line -
should make the remote CPU wake up ahead of the ICR being sent,
allowing the wait-for-ICR-idle latencies to be reduced (to perhaps a
large part due to overlapping the wakeups of multiple CPUs).

Of course this necessarily increases the latencies for the remote
CPU wakeup at least slightly. To weigh between the effects, the
condition to enable batching in vlapic_ipi() may need further tuning.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC for two reasons:
1) I started to consider elimination of the event-check IPIs in a more
   general way when MONITOR/MWAIT is in use: As long as the remote CPU
   is known to be MWAITing (or in the process of resuming after MWAIT),
   the write of softirq_pending(cpu) ought to be sufficient to wake it.
   This would yield the patch here pointless on MONITOR/MWAIT capable
   hardware.
2) The condition when to enable batching in vlapic_ipi() is already
   rather complex, but it is nevertheless unclear whether it would
   benefit from further tuning (as mentioned above).

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -447,12 +447,30 @@ void vlapic_ipi(
 
     default: {
         struct vcpu *v;
-        for_each_vcpu ( vlapic_domain(vlapic), v )
+        const struct domain *d = vlapic_domain(vlapic);
+        bool_t batch = d->max_vcpus > 2 &&
+                       (short_hand
+                        ? short_hand != APIC_DEST_SELF
+                        : vlapic_x2apic_mode(vlapic)
+                          ? dest_mode ? hweight16(dest) > 1
+                                      : dest == 0xffffffff
+                          : dest_mode
+                            ? hweight8(dest &
+                                       GET_xAPIC_DEST_FIELD(
+                                           vlapic_get_reg(vlapic,
+                                                          APIC_DFR))) > 1
+                            : dest == 0xff);
+
+        if ( batch )
+            cpu_raise_softirq_batch_begin();
+        for_each_vcpu ( d, v )
         {
             if ( vlapic_match_dest(vcpu_vlapic(v), vlapic,
                                    short_hand, dest, dest_mode) )
                 vlapic_accept_irq(v, icr_low);
         }
+        if ( batch )
+            cpu_raise_softirq_batch_finish();
         break;
     }
     }
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -23,6 +23,9 @@ irq_cpustat_t irq_stat[NR_CPUS];
 
 static softirq_handler softirq_handlers[NR_SOFTIRQS];
 
+static DEFINE_PER_CPU(cpumask_t, batch_mask);
+static DEFINE_PER_CPU(unsigned int, batching);
+
 static void __do_softirq(unsigned long ignore_mask)
 {
     unsigned int i, cpu;
@@ -70,22 +73,56 @@ void open_softirq(int nr, softirq_handle
 
 void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
 {
-    int cpu;
-    cpumask_t send_mask;
+    unsigned int cpu = smp_processor_id();
+    cpumask_t send_mask, *raise_mask;
+
+    if ( !per_cpu(batching, cpu) || in_irq() )
+    {
+        cpumask_clear(&send_mask);
+        raise_mask = &send_mask;
+    }
+    else
+        raise_mask = &per_cpu(batch_mask, cpu);
 
-    cpumask_clear(&send_mask);
     for_each_cpu(cpu, mask)
         if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
-            cpumask_set_cpu(cpu, &send_mask);
+            cpumask_set_cpu(cpu, raise_mask);
 
-    smp_send_event_check_mask(&send_mask);
+    if ( raise_mask == &send_mask )
+        smp_send_event_check_mask(raise_mask);
 }
 
 void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
-    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
-         && (cpu != smp_processor_id()) )
+    unsigned int this_cpu = smp_processor_id();
+
+    if ( test_and_set_bit(nr, &softirq_pending(cpu))
+         || (cpu == this_cpu) )
+        return;
+
+    if ( !per_cpu(batching, this_cpu) || in_irq() )
         smp_send_event_check_cpu(cpu);
+    else
+        set_bit(nr, &per_cpu(batch_mask, this_cpu));
+}
+
+void cpu_raise_softirq_batch_begin(void)
+{
+    ++per_cpu(batching, smp_processor_id());
+}
+
+void cpu_raise_softirq_batch_finish(void)
+{
+    unsigned int cpu, this_cpu = smp_processor_id();
+    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
+
+    ASSERT(per_cpu(batching, this_cpu));
+    for_each_cpu ( cpu, mask )
+        if ( !softirq_pending(cpu) )
+            cpumask_clear_cpu(cpu, mask);
+    smp_send_event_check_mask(mask);
+    cpumask_clear(mask);
+    --per_cpu(batching, this_cpu);
 }
 
 void raise_softirq(unsigned int nr)
--- a/xen/include/xen/softirq.h
+++ b/xen/include/xen/softirq.h
@@ -30,6 +30,9 @@ void cpumask_raise_softirq(const cpumask
 void cpu_raise_softirq(unsigned int cpu, unsigned int nr);
 void raise_softirq(unsigned int nr);
 
+void cpu_raise_softirq_batch_begin(void);
+void cpu_raise_softirq_batch_finish(void);
+
 /*
  * Process pending softirqs on this CPU. This should be called periodically
  * when performing work that prevents softirqs from running in a timely manner.



[-- Attachment #2: vcpu-wake-batch.patch --]
[-- Type: text/plain, Size: 6219 bytes --]

x86/HVM: batch vCPU wakeups

Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
especially when many of the remote pCPU-s are in deep C-states. For
64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
accumulated times of over 2ms were observed. Considering that Windows
broadcasts IPIs from its timer interrupt, which at least at certain
times can run at 1kHz, it is clear that this can result in good
guest behavior. In fact, on said hardware guests with significantly
beyond 40 vCPU-s simply hung when e.g. ServerManager gets started.
With the patch here, average broadcast times for a 64-vCPU guest went
to a measured maximum of 310us (which is still quite a lot).

This isn't just helping to reduce the number of ICR writes when the
host APICs run in clustered mode, but also to reduce them by
suppressing the sends altogether when - by the time
cpu_raise_softirq_batch_finish() is reached - the remote CPU already
managed to handle the softirq. Plus - when using MONITOR/MWAIT - the
update of softirq_pending(cpu), being on the monitored cache line -
should make the remote CPU wake up ahead of the ICR being sent,
allowing the wait-for-ICR-idle latencies to be reduced (to perhaps a
large part due to overlapping the wakeups of multiple CPUs).

Of course this necessarily increases the latencies for the remote
CPU wakeup at least slightly. To weigh between the effects, the
condition to enable batching in vlapic_ipi() may need further tuning.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC for two reasons:
1) I started to consider elimination of the event-check IPIs in a more
   general way when MONITOR/MWAIT is in use: As long as the remote CPU
   is known to be MWAITing (or in the process of resuming after MWAIT),
   the write of softirq_pending(cpu) ought to be sufficient to wake it.
   This would yield the patch here pointless on MONITOR/MWAIT capable
   hardware.
2) The condition when to enable batching in vlapic_ipi() is already
   rather complex, but it is nevertheless unclear whether it would
   benefit from further tuning (as mentioned above).

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -447,12 +447,30 @@ void vlapic_ipi(
 
     default: {
         struct vcpu *v;
-        for_each_vcpu ( vlapic_domain(vlapic), v )
+        const struct domain *d = vlapic_domain(vlapic);
+        bool_t batch = d->max_vcpus > 2 &&
+                       (short_hand
+                        ? short_hand != APIC_DEST_SELF
+                        : vlapic_x2apic_mode(vlapic)
+                          ? dest_mode ? hweight16(dest) > 1
+                                      : dest == 0xffffffff
+                          : dest_mode
+                            ? hweight8(dest &
+                                       GET_xAPIC_DEST_FIELD(
+                                           vlapic_get_reg(vlapic,
+                                                          APIC_DFR))) > 1
+                            : dest == 0xff);
+
+        if ( batch )
+            cpu_raise_softirq_batch_begin();
+        for_each_vcpu ( d, v )
         {
             if ( vlapic_match_dest(vcpu_vlapic(v), vlapic,
                                    short_hand, dest, dest_mode) )
                 vlapic_accept_irq(v, icr_low);
         }
+        if ( batch )
+            cpu_raise_softirq_batch_finish();
         break;
     }
     }
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -23,6 +23,9 @@ irq_cpustat_t irq_stat[NR_CPUS];
 
 static softirq_handler softirq_handlers[NR_SOFTIRQS];
 
+static DEFINE_PER_CPU(cpumask_t, batch_mask);
+static DEFINE_PER_CPU(unsigned int, batching);
+
 static void __do_softirq(unsigned long ignore_mask)
 {
     unsigned int i, cpu;
@@ -70,22 +73,56 @@ void open_softirq(int nr, softirq_handle
 
 void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
 {
-    int cpu;
-    cpumask_t send_mask;
+    unsigned int cpu = smp_processor_id();
+    cpumask_t send_mask, *raise_mask;
+
+    if ( !per_cpu(batching, cpu) || in_irq() )
+    {
+        cpumask_clear(&send_mask);
+        raise_mask = &send_mask;
+    }
+    else
+        raise_mask = &per_cpu(batch_mask, cpu);
 
-    cpumask_clear(&send_mask);
     for_each_cpu(cpu, mask)
         if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
-            cpumask_set_cpu(cpu, &send_mask);
+            cpumask_set_cpu(cpu, raise_mask);
 
-    smp_send_event_check_mask(&send_mask);
+    if ( raise_mask == &send_mask )
+        smp_send_event_check_mask(raise_mask);
 }
 
 void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
-    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
-         && (cpu != smp_processor_id()) )
+    unsigned int this_cpu = smp_processor_id();
+
+    if ( test_and_set_bit(nr, &softirq_pending(cpu))
+         || (cpu == this_cpu) )
+        return;
+
+    if ( !per_cpu(batching, this_cpu) || in_irq() )
         smp_send_event_check_cpu(cpu);
+    else
+        set_bit(nr, &per_cpu(batch_mask, this_cpu));
+}
+
+void cpu_raise_softirq_batch_begin(void)
+{
+    ++per_cpu(batching, smp_processor_id());
+}
+
+void cpu_raise_softirq_batch_finish(void)
+{
+    unsigned int cpu, this_cpu = smp_processor_id();
+    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
+
+    ASSERT(per_cpu(batching, this_cpu));
+    for_each_cpu ( cpu, mask )
+        if ( !softirq_pending(cpu) )
+            cpumask_clear_cpu(cpu, mask);
+    smp_send_event_check_mask(mask);
+    cpumask_clear(mask);
+    --per_cpu(batching, this_cpu);
 }
 
 void raise_softirq(unsigned int nr)
--- a/xen/include/xen/softirq.h
+++ b/xen/include/xen/softirq.h
@@ -30,6 +30,9 @@ void cpumask_raise_softirq(const cpumask
 void cpu_raise_softirq(unsigned int cpu, unsigned int nr);
 void raise_softirq(unsigned int nr);
 
+void cpu_raise_softirq_batch_begin(void);
+void cpu_raise_softirq_batch_finish(void);
+
 /*
  * Process pending softirqs on this CPU. This should be called periodically
  * when performing work that prevents softirqs from running in a timely manner.

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09  8:33 [PATCH, RFC] x86/HVM: batch vCPU wakeups Jan Beulich
@ 2014-09-09  9:33 ` Pasi Kärkkäinen
  2014-09-09  9:54   ` Jan Beulich
  2014-09-09 21:29 ` Tim Deegan
  1 sibling, 1 reply; 10+ messages in thread
From: Pasi Kärkkäinen @ 2014-09-09  9:33 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson, Tim Deegan

On Tue, Sep 09, 2014 at 09:33:37AM +0100, Jan Beulich wrote:
> Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
> especially when many of the remote pCPU-s are in deep C-states. For
> 64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
> accumulated times of over 2ms were observed. Considering that Windows
> broadcasts IPIs from its timer interrupt, which at least at certain
> times can run at 1kHz, it is clear that this can result in good
> guest behavior.

I guess that should say "it is clear that this *can't* result in good guest behaviour" .. 


-- Pasi


> In fact, on said hardware guests with significantly
> beyond 40 vCPU-s simply hung when e.g. ServerManager gets started.
> With the patch here, average broadcast times for a 64-vCPU guest went
> to a measured maximum of 310us (which is still quite a lot).
> 
> This isn't just helping to reduce the number of ICR writes when the
> host APICs run in clustered mode, but also to reduce them by
> suppressing the sends altogether when - by the time
> cpu_raise_softirq_batch_finish() is reached - the remote CPU already
> managed to handle the softirq. Plus - when using MONITOR/MWAIT - the
> update of softirq_pending(cpu), being on the monitored cache line -
> should make the remote CPU wake up ahead of the ICR being sent,
> allowing the wait-for-ICR-idle latencies to be reduced (to perhaps a
> large part due to overlapping the wakeups of multiple CPUs).
> 
> Of course this necessarily increases the latencies for the remote
> CPU wakeup at least slightly. To weigh between the effects, the
> condition to enable batching in vlapic_ipi() may need further tuning.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC for two reasons:
> 1) I started to consider elimination of the event-check IPIs in a more
>    general way when MONITOR/MWAIT is in use: As long as the remote CPU
>    is known to be MWAITing (or in the process of resuming after MWAIT),
>    the write of softirq_pending(cpu) ought to be sufficient to wake it.
>    This would yield the patch here pointless on MONITOR/MWAIT capable
>    hardware.
> 2) The condition when to enable batching in vlapic_ipi() is already
>    rather complex, but it is nevertheless unclear whether it would
>    benefit from further tuning (as mentioned above).
> 
> --- a/xen/arch/x86/hvm/vlapic.c
> +++ b/xen/arch/x86/hvm/vlapic.c
> @@ -447,12 +447,30 @@ void vlapic_ipi(
>  
>      default: {
>          struct vcpu *v;
> -        for_each_vcpu ( vlapic_domain(vlapic), v )
> +        const struct domain *d = vlapic_domain(vlapic);
> +        bool_t batch = d->max_vcpus > 2 &&
> +                       (short_hand
> +                        ? short_hand != APIC_DEST_SELF
> +                        : vlapic_x2apic_mode(vlapic)
> +                          ? dest_mode ? hweight16(dest) > 1
> +                                      : dest == 0xffffffff
> +                          : dest_mode
> +                            ? hweight8(dest &
> +                                       GET_xAPIC_DEST_FIELD(
> +                                           vlapic_get_reg(vlapic,
> +                                                          APIC_DFR))) > 1
> +                            : dest == 0xff);
> +
> +        if ( batch )
> +            cpu_raise_softirq_batch_begin();
> +        for_each_vcpu ( d, v )
>          {
>              if ( vlapic_match_dest(vcpu_vlapic(v), vlapic,
>                                     short_hand, dest, dest_mode) )
>                  vlapic_accept_irq(v, icr_low);
>          }
> +        if ( batch )
> +            cpu_raise_softirq_batch_finish();
>          break;
>      }
>      }
> --- a/xen/common/softirq.c
> +++ b/xen/common/softirq.c
> @@ -23,6 +23,9 @@ irq_cpustat_t irq_stat[NR_CPUS];
>  
>  static softirq_handler softirq_handlers[NR_SOFTIRQS];
>  
> +static DEFINE_PER_CPU(cpumask_t, batch_mask);
> +static DEFINE_PER_CPU(unsigned int, batching);
> +
>  static void __do_softirq(unsigned long ignore_mask)
>  {
>      unsigned int i, cpu;
> @@ -70,22 +73,56 @@ void open_softirq(int nr, softirq_handle
>  
>  void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
>  {
> -    int cpu;
> -    cpumask_t send_mask;
> +    unsigned int cpu = smp_processor_id();
> +    cpumask_t send_mask, *raise_mask;
> +
> +    if ( !per_cpu(batching, cpu) || in_irq() )
> +    {
> +        cpumask_clear(&send_mask);
> +        raise_mask = &send_mask;
> +    }
> +    else
> +        raise_mask = &per_cpu(batch_mask, cpu);
>  
> -    cpumask_clear(&send_mask);
>      for_each_cpu(cpu, mask)
>          if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
> -            cpumask_set_cpu(cpu, &send_mask);
> +            cpumask_set_cpu(cpu, raise_mask);
>  
> -    smp_send_event_check_mask(&send_mask);
> +    if ( raise_mask == &send_mask )
> +        smp_send_event_check_mask(raise_mask);
>  }
>  
>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
>  {
> -    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
> -         && (cpu != smp_processor_id()) )
> +    unsigned int this_cpu = smp_processor_id();
> +
> +    if ( test_and_set_bit(nr, &softirq_pending(cpu))
> +         || (cpu == this_cpu) )
> +        return;
> +
> +    if ( !per_cpu(batching, this_cpu) || in_irq() )
>          smp_send_event_check_cpu(cpu);
> +    else
> +        set_bit(nr, &per_cpu(batch_mask, this_cpu));
> +}
> +
> +void cpu_raise_softirq_batch_begin(void)
> +{
> +    ++per_cpu(batching, smp_processor_id());
> +}
> +
> +void cpu_raise_softirq_batch_finish(void)
> +{
> +    unsigned int cpu, this_cpu = smp_processor_id();
> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
> +
> +    ASSERT(per_cpu(batching, this_cpu));
> +    for_each_cpu ( cpu, mask )
> +        if ( !softirq_pending(cpu) )
> +            cpumask_clear_cpu(cpu, mask);
> +    smp_send_event_check_mask(mask);
> +    cpumask_clear(mask);
> +    --per_cpu(batching, this_cpu);
>  }
>  
>  void raise_softirq(unsigned int nr)
> --- a/xen/include/xen/softirq.h
> +++ b/xen/include/xen/softirq.h
> @@ -30,6 +30,9 @@ void cpumask_raise_softirq(const cpumask
>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr);
>  void raise_softirq(unsigned int nr);
>  
> +void cpu_raise_softirq_batch_begin(void);
> +void cpu_raise_softirq_batch_finish(void);
> +
>  /*
>   * Process pending softirqs on this CPU. This should be called periodically
>   * when performing work that prevents softirqs from running in a timely manner.
> 
> 

> x86/HVM: batch vCPU wakeups
> 
> Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
> especially when many of the remote pCPU-s are in deep C-states. For
> 64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
> accumulated times of over 2ms were observed. Considering that Windows
> broadcasts IPIs from its timer interrupt, which at least at certain
> times can run at 1kHz, it is clear that this can result in good
> guest behavior. In fact, on said hardware guests with significantly
> beyond 40 vCPU-s simply hung when e.g. ServerManager gets started.
> With the patch here, average broadcast times for a 64-vCPU guest went
> to a measured maximum of 310us (which is still quite a lot).
> 
> This isn't just helping to reduce the number of ICR writes when the
> host APICs run in clustered mode, but also to reduce them by
> suppressing the sends altogether when - by the time
> cpu_raise_softirq_batch_finish() is reached - the remote CPU already
> managed to handle the softirq. Plus - when using MONITOR/MWAIT - the
> update of softirq_pending(cpu), being on the monitored cache line -
> should make the remote CPU wake up ahead of the ICR being sent,
> allowing the wait-for-ICR-idle latencies to be reduced (to perhaps a
> large part due to overlapping the wakeups of multiple CPUs).
> 
> Of course this necessarily increases the latencies for the remote
> CPU wakeup at least slightly. To weigh between the effects, the
> condition to enable batching in vlapic_ipi() may need further tuning.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC for two reasons:
> 1) I started to consider elimination of the event-check IPIs in a more
>    general way when MONITOR/MWAIT is in use: As long as the remote CPU
>    is known to be MWAITing (or in the process of resuming after MWAIT),
>    the write of softirq_pending(cpu) ought to be sufficient to wake it.
>    This would yield the patch here pointless on MONITOR/MWAIT capable
>    hardware.
> 2) The condition when to enable batching in vlapic_ipi() is already
>    rather complex, but it is nevertheless unclear whether it would
>    benefit from further tuning (as mentioned above).
> 
> --- a/xen/arch/x86/hvm/vlapic.c
> +++ b/xen/arch/x86/hvm/vlapic.c
> @@ -447,12 +447,30 @@ void vlapic_ipi(
>  
>      default: {
>          struct vcpu *v;
> -        for_each_vcpu ( vlapic_domain(vlapic), v )
> +        const struct domain *d = vlapic_domain(vlapic);
> +        bool_t batch = d->max_vcpus > 2 &&
> +                       (short_hand
> +                        ? short_hand != APIC_DEST_SELF
> +                        : vlapic_x2apic_mode(vlapic)
> +                          ? dest_mode ? hweight16(dest) > 1
> +                                      : dest == 0xffffffff
> +                          : dest_mode
> +                            ? hweight8(dest &
> +                                       GET_xAPIC_DEST_FIELD(
> +                                           vlapic_get_reg(vlapic,
> +                                                          APIC_DFR))) > 1
> +                            : dest == 0xff);
> +
> +        if ( batch )
> +            cpu_raise_softirq_batch_begin();
> +        for_each_vcpu ( d, v )
>          {
>              if ( vlapic_match_dest(vcpu_vlapic(v), vlapic,
>                                     short_hand, dest, dest_mode) )
>                  vlapic_accept_irq(v, icr_low);
>          }
> +        if ( batch )
> +            cpu_raise_softirq_batch_finish();
>          break;
>      }
>      }
> --- a/xen/common/softirq.c
> +++ b/xen/common/softirq.c
> @@ -23,6 +23,9 @@ irq_cpustat_t irq_stat[NR_CPUS];
>  
>  static softirq_handler softirq_handlers[NR_SOFTIRQS];
>  
> +static DEFINE_PER_CPU(cpumask_t, batch_mask);
> +static DEFINE_PER_CPU(unsigned int, batching);
> +
>  static void __do_softirq(unsigned long ignore_mask)
>  {
>      unsigned int i, cpu;
> @@ -70,22 +73,56 @@ void open_softirq(int nr, softirq_handle
>  
>  void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
>  {
> -    int cpu;
> -    cpumask_t send_mask;
> +    unsigned int cpu = smp_processor_id();
> +    cpumask_t send_mask, *raise_mask;
> +
> +    if ( !per_cpu(batching, cpu) || in_irq() )
> +    {
> +        cpumask_clear(&send_mask);
> +        raise_mask = &send_mask;
> +    }
> +    else
> +        raise_mask = &per_cpu(batch_mask, cpu);
>  
> -    cpumask_clear(&send_mask);
>      for_each_cpu(cpu, mask)
>          if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
> -            cpumask_set_cpu(cpu, &send_mask);
> +            cpumask_set_cpu(cpu, raise_mask);
>  
> -    smp_send_event_check_mask(&send_mask);
> +    if ( raise_mask == &send_mask )
> +        smp_send_event_check_mask(raise_mask);
>  }
>  
>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
>  {
> -    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
> -         && (cpu != smp_processor_id()) )
> +    unsigned int this_cpu = smp_processor_id();
> +
> +    if ( test_and_set_bit(nr, &softirq_pending(cpu))
> +         || (cpu == this_cpu) )
> +        return;
> +
> +    if ( !per_cpu(batching, this_cpu) || in_irq() )
>          smp_send_event_check_cpu(cpu);
> +    else
> +        set_bit(nr, &per_cpu(batch_mask, this_cpu));
> +}
> +
> +void cpu_raise_softirq_batch_begin(void)
> +{
> +    ++per_cpu(batching, smp_processor_id());
> +}
> +
> +void cpu_raise_softirq_batch_finish(void)
> +{
> +    unsigned int cpu, this_cpu = smp_processor_id();
> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
> +
> +    ASSERT(per_cpu(batching, this_cpu));
> +    for_each_cpu ( cpu, mask )
> +        if ( !softirq_pending(cpu) )
> +            cpumask_clear_cpu(cpu, mask);
> +    smp_send_event_check_mask(mask);
> +    cpumask_clear(mask);
> +    --per_cpu(batching, this_cpu);
>  }
>  
>  void raise_softirq(unsigned int nr)
> --- a/xen/include/xen/softirq.h
> +++ b/xen/include/xen/softirq.h
> @@ -30,6 +30,9 @@ void cpumask_raise_softirq(const cpumask
>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr);
>  void raise_softirq(unsigned int nr);
>  
> +void cpu_raise_softirq_batch_begin(void);
> +void cpu_raise_softirq_batch_finish(void);
> +
>  /*
>   * Process pending softirqs on this CPU. This should be called periodically
>   * when performing work that prevents softirqs from running in a timely manner.

> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09  9:33 ` Pasi Kärkkäinen
@ 2014-09-09  9:54   ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2014-09-09  9:54 UTC (permalink / raw)
  To: Pasi Kärkkäinen
  Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson, Tim Deegan

>>> On 09.09.14 at 11:33, <pasik@iki.fi> wrote:
> On Tue, Sep 09, 2014 at 09:33:37AM +0100, Jan Beulich wrote:
>> Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
>> especially when many of the remote pCPU-s are in deep C-states. For
>> 64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
>> accumulated times of over 2ms were observed. Considering that Windows
>> broadcasts IPIs from its timer interrupt, which at least at certain
>> times can run at 1kHz, it is clear that this can result in good
>> guest behavior.
> 
> I guess that should say "it is clear that this *can't* result in good guest 
> behaviour" .. 

Of course - thanks for spotting.

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09  8:33 [PATCH, RFC] x86/HVM: batch vCPU wakeups Jan Beulich
  2014-09-09  9:33 ` Pasi Kärkkäinen
@ 2014-09-09 21:29 ` Tim Deegan
  2014-09-09 22:37   ` Andrew Cooper
                     ` (2 more replies)
  1 sibling, 3 replies; 10+ messages in thread
From: Tim Deegan @ 2014-09-09 21:29 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson

At 09:33 +0100 on 09 Sep (1410251617), Jan Beulich wrote:
> Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
> especially when many of the remote pCPU-s are in deep C-states. For
> 64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
> accumulated times of over 2ms were observed. Considering that Windows
> broadcasts IPIs from its timer interrupt

Bleargh. :(

> RFC for two reasons:
> 1) I started to consider elimination of the event-check IPIs in a more
>    general way when MONITOR/MWAIT is in use: As long as the remote CPU
>    is known to be MWAITing (or in the process of resuming after MWAIT),
>    the write of softirq_pending(cpu) ought to be sufficient to wake it.
>    This would yield the patch here pointless on MONITOR/MWAIT capable
>    hardware.

That's a promising line.   I guess the number of 64-way systems that
don't support MONITOR/MWAIT is pretty small. :)

> 2) The condition when to enable batching in vlapic_ipi() is already
>    rather complex, but it is nevertheless unclear whether it would
>    benefit from further tuning (as mentioned above).

The test you chose seems plausible enough to me.  I suppose we care
enough about single-destination IPIs on idle hardware that we need
this check at all; but I doubt that IPIs to exactly two destinations,
for example, are worth worrying about.

The implementation looks OK in general; a few nits below.

> --- a/xen/arch/x86/hvm/vlapic.c
> +++ b/xen/arch/x86/hvm/vlapic.c
> @@ -447,12 +447,30 @@ void vlapic_ipi(
>  
>      default: {
>          struct vcpu *v;
> -        for_each_vcpu ( vlapic_domain(vlapic), v )
> +        const struct domain *d = vlapic_domain(vlapic);
> +        bool_t batch = d->max_vcpus > 2 &&
> +                       (short_hand
> +                        ? short_hand != APIC_DEST_SELF
> +                        : vlapic_x2apic_mode(vlapic)
> +                          ? dest_mode ? hweight16(dest) > 1
> +                                      : dest == 0xffffffff
> +                          : dest_mode
> +                            ? hweight8(dest &
> +                                       GET_xAPIC_DEST_FIELD(
> +                                           vlapic_get_reg(vlapic,
> +                                                          APIC_DFR))) > 1
> +                            : dest == 0xff);

Yoiks.  Could you extract some of this into a helper function, say
is_unicast_dest(), just for readability?

>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
>  {
> -    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
> -         && (cpu != smp_processor_id()) )
> +    unsigned int this_cpu = smp_processor_id();

The name clashes with the this_cpu() macro (obviously OK for compiling
but harder to read) and can be dropped since...

> +
> +    if ( test_and_set_bit(nr, &softirq_pending(cpu))
> +         || (cpu == this_cpu) )
> +        return;
> +
> +    if ( !per_cpu(batching, this_cpu) || in_irq() )

this_cpu(batching) is the preferred way of addressing this on Xen
anyway - though I guess maybe it's _slightly_ less efficient to
recalculate get_cpu_info() here than to indirect through the
per_cpu_offsets[]?  I haven't looked at the asm.

>          smp_send_event_check_cpu(cpu);
> +    else
> +        set_bit(nr, &per_cpu(batch_mask, this_cpu));
> +}
> +
> +void cpu_raise_softirq_batch_begin(void)
> +{
> +    ++per_cpu(batching, smp_processor_id());

This one should definitely be using this_cpu().

> +}
> +
> +void cpu_raise_softirq_batch_finish(void)
> +{
> +    unsigned int cpu, this_cpu = smp_processor_id();
> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);

Again, this_cpu()?

> +    ASSERT(per_cpu(batching, this_cpu));
> +    for_each_cpu ( cpu, mask )
> +        if ( !softirq_pending(cpu) )
> +            cpumask_clear_cpu(cpu, mask);
> +    smp_send_event_check_mask(mask);
> +    cpumask_clear(mask);
> +    --per_cpu(batching, this_cpu);
>  }

One other thing that occurs to me is that we might do the batching in
the caller (by gathering a mask of pcpus in the vlapic code) but that
sounds messy.  And it's not like softirq is so particularly
latency-sensitive that we'd want to avoid this much overhead in the
common case.  So on second thoughts this way is better. :)

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09 21:29 ` Tim Deegan
@ 2014-09-09 22:37   ` Andrew Cooper
  2014-09-10 10:29     ` Tim Deegan
  2014-09-10  9:28   ` Jan Beulich
  2014-09-10 13:10   ` Jan Beulich
  2 siblings, 1 reply; 10+ messages in thread
From: Andrew Cooper @ 2014-09-09 22:37 UTC (permalink / raw)
  To: Tim Deegan, Jan Beulich; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson

On 09/09/2014 22:29, Tim Deegan wrote:
> At 09:33 +0100 on 09 Sep (1410251617), Jan Beulich wrote:
>> Mass wakeups (via vlapic_ipi()) can take enormous amounts of time,
>> especially when many of the remote pCPU-s are in deep C-states. For
>> 64-vCPU Windows Server 2012 R2 guests on Ivybridge hardware,
>> accumulated times of over 2ms were observed. Considering that Windows
>> broadcasts IPIs from its timer interrupt
> Bleargh. :(
>
>> RFC for two reasons:
>> 1) I started to consider elimination of the event-check IPIs in a more
>>    general way when MONITOR/MWAIT is in use: As long as the remote CPU
>>    is known to be MWAITing (or in the process of resuming after MWAIT),
>>    the write of softirq_pending(cpu) ought to be sufficient to wake it.
>>    This would yield the patch here pointless on MONITOR/MWAIT capable
>>    hardware.
> That's a promising line.   I guess the number of 64-way systems that
> don't support MONITOR/MWAIT is pretty small. :)
>
>> 2) The condition when to enable batching in vlapic_ipi() is already
>>    rather complex, but it is nevertheless unclear whether it would
>>    benefit from further tuning (as mentioned above).
> The test you chose seems plausible enough to me.  I suppose we care
> enough about single-destination IPIs on idle hardware that we need
> this check at all; but I doubt that IPIs to exactly two destinations,
> for example, are worth worrying about.
>
> The implementation looks OK in general; a few nits below.
>
>> --- a/xen/arch/x86/hvm/vlapic.c
>> +++ b/xen/arch/x86/hvm/vlapic.c
>> @@ -447,12 +447,30 @@ void vlapic_ipi(
>>  
>>      default: {
>>          struct vcpu *v;
>> -        for_each_vcpu ( vlapic_domain(vlapic), v )
>> +        const struct domain *d = vlapic_domain(vlapic);
>> +        bool_t batch = d->max_vcpus > 2 &&
>> +                       (short_hand
>> +                        ? short_hand != APIC_DEST_SELF
>> +                        : vlapic_x2apic_mode(vlapic)
>> +                          ? dest_mode ? hweight16(dest) > 1
>> +                                      : dest == 0xffffffff
>> +                          : dest_mode
>> +                            ? hweight8(dest &
>> +                                       GET_xAPIC_DEST_FIELD(
>> +                                           vlapic_get_reg(vlapic,
>> +                                                          APIC_DFR))) > 1
>> +                            : dest == 0xff);
> Yoiks.  Could you extract some of this into a helper function, say
> is_unicast_dest(), just for readability?
>
>>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
>>  {
>> -    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
>> -         && (cpu != smp_processor_id()) )
>> +    unsigned int this_cpu = smp_processor_id();
> The name clashes with the this_cpu() macro (obviously OK for compiling
> but harder to read) and can be dropped since...
>
>> +
>> +    if ( test_and_set_bit(nr, &softirq_pending(cpu))
>> +         || (cpu == this_cpu) )
>> +        return;
>> +
>> +    if ( !per_cpu(batching, this_cpu) || in_irq() )
> this_cpu(batching) is the preferred way of addressing this on Xen
> anyway - though I guess maybe it's _slightly_ less efficient to
> recalculate get_cpu_info() here than to indirect through the
> per_cpu_offsets[]?  I haven't looked at the asm.

It depends whether the compiler decides to do any stack manipulation. 
Certainly with the older implementation, if gcc leaves the stack pointer
alone, it will coalesce calls to get_cpu_info(), but will recall it if
any pushes/pops happen between basic blocks.  I believe my newer
implementation is coalesced across small stack changes, but it still
recalculated more than I would expect in some of the later functions. 
(Guess who spent a long time looking at generated asm while working on
that patch)

>
>>          smp_send_event_check_cpu(cpu);
>> +    else
>> +        set_bit(nr, &per_cpu(batch_mask, this_cpu));
>> +}
>> +
>> +void cpu_raise_softirq_batch_begin(void)
>> +{
>> +    ++per_cpu(batching, smp_processor_id());
> This one should definitely be using this_cpu().

Agreed in this case...

>
>> +}
>> +
>> +void cpu_raise_softirq_batch_finish(void)
>> +{
>> +    unsigned int cpu, this_cpu = smp_processor_id();
>> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
> Again, this_cpu()?

...But disagree here.  Multiple uses of this_cpu($FOO) cannot be
coalesced due to RELOC_HIDE() deliberately preventing optimisation.  For
multiple uses, pulling it out by pointer to start with results in rather
more efficient code.

~Andrew

>
>> +    ASSERT(per_cpu(batching, this_cpu));
>> +    for_each_cpu ( cpu, mask )
>> +        if ( !softirq_pending(cpu) )
>> +            cpumask_clear_cpu(cpu, mask);
>> +    smp_send_event_check_mask(mask);
>> +    cpumask_clear(mask);
>> +    --per_cpu(batching, this_cpu);
>>  }
> One other thing that occurs to me is that we might do the batching in
> the caller (by gathering a mask of pcpus in the vlapic code) but that
> sounds messy.  And it's not like softirq is so particularly
> latency-sensitive that we'd want to avoid this much overhead in the
> common case.  So on second thoughts this way is better. :)
>
> Cheers,
>
> Tim.
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09 21:29 ` Tim Deegan
  2014-09-09 22:37   ` Andrew Cooper
@ 2014-09-10  9:28   ` Jan Beulich
  2014-09-10 13:10   ` Jan Beulich
  2 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2014-09-10  9:28 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson

>>> On 09.09.14 at 23:29, <tim@xen.org> wrote:
> At 09:33 +0100 on 09 Sep (1410251617), Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/vlapic.c
>> +++ b/xen/arch/x86/hvm/vlapic.c
>> @@ -447,12 +447,30 @@ void vlapic_ipi(
>>  
>>      default: {
>>          struct vcpu *v;
>> -        for_each_vcpu ( vlapic_domain(vlapic), v )
>> +        const struct domain *d = vlapic_domain(vlapic);
>> +        bool_t batch = d->max_vcpus > 2 &&
>> +                       (short_hand
>> +                        ? short_hand != APIC_DEST_SELF
>> +                        : vlapic_x2apic_mode(vlapic)
>> +                          ? dest_mode ? hweight16(dest) > 1
>> +                                      : dest == 0xffffffff
>> +                          : dest_mode
>> +                            ? hweight8(dest &
>> +                                       GET_xAPIC_DEST_FIELD(
>> +                                           vlapic_get_reg(vlapic,
>> +                                                          APIC_DFR))) > 1
>> +                            : dest == 0xff);
> 
> Yoiks.  Could you extract some of this into a helper function, say
> is_unicast_dest(), just for readability?

Sure.

>>  void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
>>  {
>> -    if ( !test_and_set_bit(nr, &softirq_pending(cpu))
>> -         && (cpu != smp_processor_id()) )
>> +    unsigned int this_cpu = smp_processor_id();
> 
> The name clashes with the this_cpu() macro (obviously OK for compiling
> but harder to read) and can be dropped since...
> 
>> +
>> +    if ( test_and_set_bit(nr, &softirq_pending(cpu))
>> +         || (cpu == this_cpu) )
>> +        return;
>> +
>> +    if ( !per_cpu(batching, this_cpu) || in_irq() )
> 
> this_cpu(batching) is the preferred way of addressing this on Xen
> anyway - though I guess maybe it's _slightly_ less efficient to
> recalculate get_cpu_info() here than to indirect through the
> per_cpu_offsets[]?  I haven't looked at the asm.

Right - multiple uses of this_cpu() in the same function are indeed
less efficient than latching smp_processor_id() into a local variable.

>>          smp_send_event_check_cpu(cpu);
>> +    else
>> +        set_bit(nr, &per_cpu(batch_mask, this_cpu));
>> +}
>> +
>> +void cpu_raise_softirq_batch_begin(void)
>> +{
>> +    ++per_cpu(batching, smp_processor_id());
> 
> This one should definitely be using this_cpu().

Absolutely - unintended copy-and-paste effect.

>> +}
>> +
>> +void cpu_raise_softirq_batch_finish(void)
>> +{
>> +    unsigned int cpu, this_cpu = smp_processor_id();
>> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
> 
> Again, this_cpu()?

No, as again the latched local variable yields better code.

>> +    ASSERT(per_cpu(batching, this_cpu));
>> +    for_each_cpu ( cpu, mask )
>> +        if ( !softirq_pending(cpu) )
>> +            cpumask_clear_cpu(cpu, mask);
>> +    smp_send_event_check_mask(mask);
>> +    cpumask_clear(mask);
>> +    --per_cpu(batching, this_cpu);
>>  }
> 
> One other thing that occurs to me is that we might do the batching in
> the caller (by gathering a mask of pcpus in the vlapic code) but that
> sounds messy.  And it's not like softirq is so particularly
> latency-sensitive that we'd want to avoid this much overhead in the
> common case.  So on second thoughts this way is better. :)

In fact I had it that way first, and it looked ugly (apart from making
it more cumbersome for potential other use sites to switch to this
batching model).

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09 22:37   ` Andrew Cooper
@ 2014-09-10 10:29     ` Tim Deegan
  2014-09-10 10:37       ` Andrew Cooper
  0 siblings, 1 reply; 10+ messages in thread
From: Tim Deegan @ 2014-09-10 10:29 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson, Jan Beulich

At 23:37 +0100 on 09 Sep (1410302243), Andrew Cooper wrote:
> >> +void cpu_raise_softirq_batch_finish(void)
> >> +{
> >> +    unsigned int cpu, this_cpu = smp_processor_id();
> >> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
> > Again, this_cpu()?
> 
> ...But disagree here.  Multiple uses of this_cpu($FOO) cannot be
> coalesced due to RELOC_HIDE() deliberately preventing optimisation.  For
> multiple uses, pulling it out by pointer to start with results in rather
> more efficient code.

I wasn't questioning the pointer, but to the use of per_cpu(...,
this_cpu) instead of this_cpu(...).  Both of those involve a
RELOC_HIDE().

Anyway, it's pretty clear from your and Jan's replies that multiple
this_cpu() invocations are slower -- thanks for the clarification!

Tim.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-10 10:29     ` Tim Deegan
@ 2014-09-10 10:37       ` Andrew Cooper
  2014-09-10 11:58         ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Andrew Cooper @ 2014-09-10 10:37 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson, Jan Beulich

On 10/09/14 11:29, Tim Deegan wrote:
> At 23:37 +0100 on 09 Sep (1410302243), Andrew Cooper wrote:
>>>> +void cpu_raise_softirq_batch_finish(void)
>>>> +{
>>>> +    unsigned int cpu, this_cpu = smp_processor_id();
>>>> +    cpumask_t *mask = &per_cpu(batch_mask, this_cpu);
>>> Again, this_cpu()?
>> ...But disagree here.  Multiple uses of this_cpu($FOO) cannot be
>> coalesced due to RELOC_HIDE() deliberately preventing optimisation.  For
>> multiple uses, pulling it out by pointer to start with results in rather
>> more efficient code.
> I wasn't questioning the pointer, but to the use of per_cpu(...,
> this_cpu) instead of this_cpu(...).  Both of those involve a
> RELOC_HIDE().
>
> Anyway, it's pretty clear from your and Jan's replies that multiple
> this_cpu() invocations are slower -- thanks for the clarification!
>
> Tim.

The difference (if any) between per_cpu() vs this_cpu() depends on
whether the compiler decides to recalculate smp_processor_id() or not. 
The former is manual optimisation on behalf of the programmer.

I am beginning to wonder whether the use of __attribute__((const)) might
help with get_cpu_info().  Despite the explicit stack pointer reference
which is undoubtedly the source of optimisation confusion for the
compiler, inside a function, the result of get_cpu_info() is genuinely
never going to change, even though the compiler can't necessarily prove
the fact.

~Andrew

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-10 10:37       ` Andrew Cooper
@ 2014-09-10 11:58         ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2014-09-10 11:58 UTC (permalink / raw)
  To: Andrew Cooper, Tim Deegan
  Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson

>>> On 10.09.14 at 12:37, <andrew.cooper3@citrix.com> wrote:
> I am beginning to wonder whether the use of __attribute__((const)) might
> help with get_cpu_info().  Despite the explicit stack pointer reference
> which is undoubtedly the source of optimisation confusion for the
> compiler, inside a function, the result of get_cpu_info() is genuinely
> never going to change, even though the compiler can't necessarily prove
> the fact.

The const and pure attributes, at least when I last checked, get
dropped on the floor by gcc (or at least its code generator /
optimizer) for inline functions.

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, RFC] x86/HVM: batch vCPU wakeups
  2014-09-09 21:29 ` Tim Deegan
  2014-09-09 22:37   ` Andrew Cooper
  2014-09-10  9:28   ` Jan Beulich
@ 2014-09-10 13:10   ` Jan Beulich
  2 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2014-09-10 13:10 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Ian Campbell, xen-devel, Keir Fraser, Ian Jackson

[-- Attachment #1: Type: text/plain, Size: 1208 bytes --]

>>> On 09.09.14 at 23:29, <tim@xen.org> wrote:
> At 09:33 +0100 on 09 Sep (1410251617), Jan Beulich wrote:
>> RFC for two reasons:
>> 1) I started to consider elimination of the event-check IPIs in a more
>>    general way when MONITOR/MWAIT is in use: As long as the remote CPU
>>    is known to be MWAITing (or in the process of resuming after MWAIT),
>>    the write of softirq_pending(cpu) ought to be sufficient to wake it.
>>    This would yield the patch here pointless on MONITOR/MWAIT capable
>>    hardware.
> 
> That's a promising line.

And it alone gives quite a bit better performance than batching, at
least on the hardware I can directly test on. The maxima go down
even more significantly than the average values.

Putting the batching back on top improves the average delivery
times a little further, but surprisingly makes the maxima go back
up to almost where they were with just batching. So far I haven't
been able to think of an explanation for this. But while attaching
the raw patch here for reference, I'll want to wait with formal
submission until I've seen the numbers from the big box (also to
decide whether to include or drop the batching one).

Jan


[-- Attachment #2: x86-suppress-event-check-IPI.patch --]
[-- Type: text/plain, Size: 2728 bytes --]


One aspect worth noting is that cpumask_raise_softirq() gets brought in
sync here with cpu_raise_softirq() in that now both don't attempt to
raise a self-IPI on the processing CPU.

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -340,6 +340,16 @@ void cpuidle_wakeup_mwait(cpumask_t *mas
     cpumask_andnot(mask, mask, &target);
 }
 
+bool_t arch_skip_send_event_check(unsigned int cpu)
+{
+    /*
+     * This relies on softirq_pending() and mwait_wakeup() to access data
+     * on the same cache line.
+     */
+    smp_mb();
+    return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
+}
+
 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 {
     unsigned int cpu = smp_processor_id();
@@ -359,7 +369,7 @@ void mwait_idle_with_hints(unsigned int 
      * Timer deadline passing is the event on which we will be woken via
      * cpuidle_mwait_wakeup. So check it now that the location is armed.
      */
-    if ( expires > NOW() || expires == 0 )
+    if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
     {
         cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
         __mwait(eax, ecx);
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -70,12 +70,14 @@ void open_softirq(int nr, softirq_handle
 
 void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
 {
-    int cpu;
+    unsigned int cpu, this_cpu = smp_processor_id();
     cpumask_t send_mask;
 
     cpumask_clear(&send_mask);
     for_each_cpu(cpu, mask)
-        if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
+        if ( !test_and_set_bit(nr, &softirq_pending(cpu)) &&
+             cpu != this_cpu &&
+             !arch_skip_send_event_check(cpu) )
             cpumask_set_cpu(cpu, &send_mask);
 
     smp_send_event_check_mask(&send_mask);
@@ -84,7 +86,8 @@ void cpumask_raise_softirq(const cpumask
 void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
     if ( !test_and_set_bit(nr, &softirq_pending(cpu))
-         && (cpu != smp_processor_id()) )
+         && (cpu != smp_processor_id())
+         && !arch_skip_send_event_check(cpu) )
         smp_send_event_check_cpu(cpu);
 }
 
--- a/xen/include/asm-arm/softirq.h
+++ b/xen/include/asm-arm/softirq.h
@@ -3,6 +3,8 @@
 
 #define NR_ARCH_SOFTIRQS       0
 
+#define arch_skip_send_event_check(cpu) 0
+
 #endif /* __ASM_SOFTIRQ_H__ */
 /*
  * Local variables:
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -9,4 +9,6 @@
 #define PCI_SERR_SOFTIRQ       (NR_COMMON_SOFTIRQS + 4)
 #define NR_ARCH_SOFTIRQS       5
 
+bool_t arch_skip_send_event_check(unsigned int cpu);
+
 #endif /* __ASM_SOFTIRQ_H__ */

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-09-10 13:10 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-09-09  8:33 [PATCH, RFC] x86/HVM: batch vCPU wakeups Jan Beulich
2014-09-09  9:33 ` Pasi Kärkkäinen
2014-09-09  9:54   ` Jan Beulich
2014-09-09 21:29 ` Tim Deegan
2014-09-09 22:37   ` Andrew Cooper
2014-09-10 10:29     ` Tim Deegan
2014-09-10 10:37       ` Andrew Cooper
2014-09-10 11:58         ` Jan Beulich
2014-09-10  9:28   ` Jan Beulich
2014-09-10 13:10   ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.