* [PATCH] CPUIDLE: shorten hpet spin_lock holding time
@ 2010-04-20 5:39 Wei, Gang
2010-04-20 12:49 ` Keir Fraser
0 siblings, 1 reply; 19+ messages in thread
From: Wei, Gang @ 2010-04-20 5:39 UTC (permalink / raw)
To: xen-devel@lists.xensource.com; +Cc: Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 2706 bytes --]
CPUIDLE: shorten hpet spin_lock holding time
Try to reduce spin_lock overhead for deep C state entry/exit. This will benefit systems with a lot of cpus which need the hpet broadcast to wakeup from deep C state.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 7ee8bb40200a xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/hpet.c Fri Apr 16 15:05:28 2010 +0800
@@ -186,6 +186,9 @@ static void handle_hpet_broadcast(struct
again:
ch->next_event = STIME_MAX;
+
+ spin_unlock_irq(&ch->lock);
+
next_event = STIME_MAX;
mask = (cpumask_t)CPU_MASK_NONE;
now = NOW();
@@ -204,10 +207,14 @@ again:
if ( next_event != STIME_MAX )
{
- if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
+ spin_lock_irq(&ch->lock);
+
+ if ( next_event < ch->next_event &&
+ reprogram_hpet_evt_channel(ch, next_event, now, 0) )
goto again;
- }
- spin_unlock_irq(&ch->lock);
+
+ spin_unlock_irq(&ch->lock);
+ }
}
static void hpet_interrupt_handler(int irq, void *data,
@@ -656,10 +663,15 @@ void hpet_broadcast_enter(void)
BUG_ON( !ch );
ASSERT(!local_irq_is_enabled());
- spin_lock(&ch->lock);
if ( hpet_attach_channel )
+ {
+ spin_lock(&ch->lock);
+
hpet_attach_channel(cpu, ch);
+
+ spin_unlock(&ch->lock);
+ }
/* Cancel any outstanding LAPIC timer event and disable interrupts. */
reprogram_timer(0);
@@ -667,6 +679,8 @@ void hpet_broadcast_enter(void)
cpu_set(cpu, ch->cpumask);
+ spin_lock(&ch->lock);
+
/* reprogram if current cpu expire time is nearer */
if ( this_cpu(timer_deadline_end) < ch->next_event )
reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline_end), NOW(), 1);
@@ -683,8 +697,6 @@ void hpet_broadcast_exit(void)
return;
BUG_ON( !ch );
-
- spin_lock_irq(&ch->lock);
if ( cpu_test_and_clear(cpu, ch->cpumask) )
{
@@ -693,14 +705,22 @@ void hpet_broadcast_exit(void)
if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
raise_softirq(TIMER_SOFTIRQ);
+ spin_lock_irq(&ch->lock);
+
if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+
+ spin_unlock_irq(&ch->lock);
}
if ( hpet_detach_channel )
+ {
+ spin_lock_irq(&ch->lock);
+
hpet_detach_channel(cpu);
- spin_unlock_irq(&ch->lock);
+ spin_unlock_irq(&ch->lock);
+ }
}
int hpet_broadcast_is_available(void)
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 19+ messages in thread

* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 5:39 [PATCH] CPUIDLE: shorten hpet spin_lock holding time Wei, Gang
@ 2010-04-20 12:49 ` Keir Fraser
  2010-04-20 14:04 ` Wei, Gang
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-20 12:49 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

Is this a measurable win? The newer locking looks like it could be dodgy on
32-bit Xen: the 64-bit reads of timer_deadline_{start,end} will be
non-atomic and unsynchronised so you can read garbage. Even on 64-bit Xen
you can read stale values. I'll be surprised if you got a performance win
from chopping up critical regions in individual functions like that anyway.

 -- Keir

On 20/04/2010 06:39, "Wei, Gang" <gang.wei@intel.com> wrote:

> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>
>
> [...]

^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 12:49 ` Keir Fraser
@ 2010-04-20 14:04 ` Wei, Gang
  2010-04-20 14:21 ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Wei, Gang @ 2010-04-20 14:04 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Tuesday, 2010-4-20 8:49 PM, Keir Fraser wrote:
> Is this a measurable win? The newer locking looks like it could be
> dodgy on 32-bit Xen: the 64-bit reads of timer_deadline_{start,end}
> will be non-atomic and unsynchronised so you can read garbage. Even
> on 64-bit Xen you can read stale values. I'll be surprised if you got
> a performance win from chopping up critical regions in individual
> functions like that anyway.

First of all, this is a measurable power win for the cpu overcommitment
idle case (vcpu:pcpu > 2:1, pcpu >= 32, guests are non-tickless kernels).

As to the non-atomic access to timer_deadline_{start,end}, it should
already be there before this patch. It is not protected by the hpet lock.
Shall we add a rw_lock for each timer_deadline_{start,end}? This can be
done separately.

Jimmy

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 14:04 ` Wei, Gang
@ 2010-04-20 14:21 ` Keir Fraser
  2010-04-20 15:20 ` Wei, Gang
  2010-04-20 16:05 ` Wei, Gang
  0 siblings, 2 replies; 19+ messages in thread
From: Keir Fraser @ 2010-04-20 14:21 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 20/04/2010 15:04, "Wei, Gang" <gang.wei@intel.com> wrote:

> First of all, this is a measurable power win for the cpu overcommitment
> idle case (vcpu:pcpu > 2:1, pcpu >= 32, guests are non-tickless kernels).

So lots of short sleep periods, and possibly only a very few HPET channels
to share? How prevalent is always-running APIC timer now, and is that going
to be supported in future processors?

> As to the non-atomic access to timer_deadline_{start,end}, it should
> already be there before this patch. It is not protected by the hpet lock.
> Shall we add a rw_lock for each timer_deadline_{start,end}? This can be
> done separately.

The bug isn't previously there, since the fields will not be read unless
the cpu is in ch->cpumask, which (was) protected by ch->lock. That was
sufficient since a CPU would not modify timer_deadline_{start,end} between
hpet_broadcast_enter and hpet_broadcast_exit. After your patch,
handle_hpet_broadcast is no longer fully synchronised against those
functions.

 -- Keir

^ permalink raw reply [flat|nested] 19+ messages in thread

* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 14:21 ` Keir Fraser
@ 2010-04-20 15:20 ` Wei, Gang
  0 siblings, 0 replies; 19+ messages in thread
From: Wei, Gang @ 2010-04-20 15:20 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Tuesday, 2010-4-20 10:22 PM, Keir Fraser wrote:
> So lots of short sleep periods, and possibly only a very few HPET
> channels to share?

Yes.

> How prevalent is always-running APIC timer now, and is that going to be
> supported in future processors?

Always-running APIC timer just started from Westmere. And I personally
guess it should be supported in future processors. There are a lot of
existing systems which still need hpet broadcast wakeup.

> The bug isn't previously there, since the fields will not be read
> unless the cpu is in ch->cpumask, which (was) protected by ch->lock.
> That was sufficient since a CPU would not modify
> timer_deadline_{start,end} between hpet_broadcast_enter and
> hpet_broadcast_exit. After your patch, handle_hpet_broadcast is no
> longer fully synchronised against those functions.

Ok, you are right. So we may just need to make sure the cpu is in
ch->cpumask while reading the timer_deadline_{start,end}. I can make some
changes to my patch.

Jimmy

^ permalink raw reply [flat|nested] 19+ messages in thread
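The rule Jimmy settles on here, namely only reading a CPU's per-cpu deadline while that CPU is verifiably still in ch->cpumask, can be modelled in a few lines. This is an illustrative single-threaded sketch, not the Xen code: the lock/unlock points appear only as comments, a plain bitmask stands in for cpumask_t, the expired-CPU wakeup mask is omitted, and the names (scan_next_event(), attach(), detach()) are made up for the example.

```c
#include <assert.h>
#include <stdint.h>

typedef int64_t s_time_t;
#define STIME_MAX INT64_MAX
#define NCPUS 4

/* A CPU only rewrites its deadline after clearing its bit on exit, so the
 * deadline is safe to read exactly while the bit is set in the mask.      */
static uint32_t ch_cpumask;
static s_time_t deadline_end[NCPUS];

/* Model of the handler's scan: re-test membership (under ch->lock in the
 * real code) before each per-cpu read, and track the earliest deadline.   */
static s_time_t scan_next_event(void)
{
    s_time_t next = STIME_MAX;
    for ( int cpu = 0; cpu < NCPUS; cpu++ )
    {
        /* spin_lock_irq(&ch->lock) in the real code */
        if ( ch_cpumask & (1u << cpu) )            /* cpumask_test_cpu() */
            if ( deadline_end[cpu] < next )
                next = deadline_end[cpu];
        /* spin_unlock_irq(&ch->lock) */
    }
    return next;
}

/* hpet_broadcast_enter(): publish deadline, then join the mask. */
static void attach(int cpu, s_time_t dl)
{
    deadline_end[cpu] = dl;
    ch_cpumask |= 1u << cpu;
}

/* hpet_broadcast_exit(): leave the mask; the deadline may change freely
 * afterwards, but the scan will no longer read it.                        */
static void detach(int cpu)
{
    ch_cpumask &= ~(1u << cpu);
    deadline_end[cpu] = -1;
}
```

A detached CPU's (now meaningless) deadline is never consulted, which is the property Keir pointed out the v1 patch had lost.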
* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 14:21 ` Keir Fraser
  2010-04-20 15:20 ` Wei, Gang
@ 2010-04-20 16:05 ` Wei, Gang
  2010-04-21 8:09 ` Keir Fraser
  2010-04-22 8:21 ` Keir Fraser
  1 sibling, 2 replies; 19+ messages in thread
From: Wei, Gang @ 2010-04-20 16:05 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

Resend.

CPUIDLE: shorten hpet spin_lock holding time

Try to reduce spin_lock overhead for deep C state entry/exit. This will
benefit systems with a lot of cpus which need the hpet broadcast to wakeup
from deep C state.

Signed-off-by: Wei Gang <gang.wei@intel.com>

diff -r dbf0fd95180f xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c Tue Apr 20 14:32:53 2010 +0100
+++ b/xen/arch/x86/hpet.c Tue Apr 20 23:48:19 2010 +0800
@@ -186,6 +186,9 @@ static void handle_hpet_broadcast(struct
again:
ch->next_event = STIME_MAX;
+
+ spin_unlock_irq(&ch->lock);
+
next_event = STIME_MAX;
mask = (cpumask_t)CPU_MASK_NONE;
now = NOW();
@@ -193,10 +196,17 @@ again:
/* find all expired events */
for_each_cpu_mask(cpu, ch->cpumask)
{
- if ( per_cpu(timer_deadline_start, cpu) <= now )
- cpu_set(cpu, mask);
- else if ( per_cpu(timer_deadline_end, cpu) < next_event )
- next_event = per_cpu(timer_deadline_end, cpu);
+ spin_lock_irq(&ch->lock);
+
+ if ( cpumask_test_cpu(cpu, ch->cpumask) )
+ {
+ if ( per_cpu(timer_deadline_start, cpu) <= now )
+ cpu_set(cpu, mask);
+ else if ( per_cpu(timer_deadline_end, cpu) < next_event )
+ next_event = per_cpu(timer_deadline_end, cpu);
+ }
+
+ spin_unlock_irq(&ch->lock);
}
/* wakeup the cpus which have an expired event. */
@@ -204,10 +214,14 @@ again:
if ( next_event != STIME_MAX )
{
- if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
+ spin_lock_irq(&ch->lock);
+
+ if ( next_event < ch->next_event &&
+ reprogram_hpet_evt_channel(ch, next_event, now, 0) )
goto again;
- }
- spin_unlock_irq(&ch->lock);
+
+ spin_unlock_irq(&ch->lock);
+ }
}
static void hpet_interrupt_handler(int irq, void *data,
@@ -656,17 +670,23 @@ void hpet_broadcast_enter(void)
BUG_ON( !ch );
ASSERT(!local_irq_is_enabled());
- spin_lock(&ch->lock);
if ( hpet_attach_channel )
+ {
+ spin_lock(&ch->lock);
+
hpet_attach_channel(cpu, ch);
+
+ spin_unlock(&ch->lock);
+ }
/* Cancel any outstanding LAPIC timer event and disable interrupts. */
reprogram_timer(0);
disable_APIC_timer();
+ spin_lock(&ch->lock);
+
cpu_set(cpu, ch->cpumask);
-
/* reprogram if current cpu expire time is nearer */
if ( this_cpu(timer_deadline_end) < ch->next_event )
reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline_end), NOW(), 1);
@@ -684,23 +704,28 @@ void hpet_broadcast_exit(void)
BUG_ON( !ch );
+ /* Reprogram the deadline; trigger timer work now if it has passed. */
+ enable_APIC_timer();
+ if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
+ raise_softirq(TIMER_SOFTIRQ);
+
spin_lock_irq(&ch->lock);
- if ( cpu_test_and_clear(cpu, ch->cpumask) )
- {
- /* Reprogram the deadline; trigger timer work now if it has passed. */
- enable_APIC_timer();
- if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
- raise_softirq(TIMER_SOFTIRQ);
-
- if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
- reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
- }
+ cpu_clear(cpu, ch->cpumask);
+ if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
+ reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+
+ spin_unlock_irq(&ch->lock);
+
if ( hpet_detach_channel )
+ {
+ spin_lock_irq(&ch->lock);
+
hpet_detach_channel(cpu);
- spin_unlock_irq(&ch->lock);
+ spin_unlock_irq(&ch->lock);
+ }
}
int hpet_broadcast_is_available(void)

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 16:05 ` Wei, Gang
@ 2010-04-21 8:09 ` Keir Fraser
  2010-04-21 9:06 ` Wei, Gang
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-21 8:09 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote:

> Resend.
>
> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>

It fixes the unsafe accesses to timer_deadline_{start,end} but I still
think this optimisation is misguided and also unsafe. There is nothing to
stop new CPUs being added to ch->cpumask after you start scanning
ch->cpumask. For example, some new CPU which has a timer_deadline_end
greater than ch->next_event, so it does not reprogram the HPET. But
handle_hpet_broadcast is already mid-scan and misses this new CPU, so it
does not reprogram the HPET either. Hence no timer fires for the new CPU
and it misses its deadline.

Really I think a better approach than something like this patch would be
to better advertise the timer_slop=xxx Xen boot parameter for power-saving
scenarios. I wonder what your numbers look like if you re-run your
benchmark with (say) timer_slop=10000000 (i.e., 10ms slop) on the Xen
command line?

 -- Keir

^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 8:09 ` Keir Fraser
@ 2010-04-21 9:06 ` Wei, Gang
  2010-04-21 9:25 ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Wei, Gang @ 2010-04-21 9:06 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Wednesday, 2010-4-21 4:10 PM, Keir Fraser wrote:
> It fixes the unsafe accesses to timer_deadline_{start,end} but I
> still think this optimisation is misguided and also unsafe. There is
> nothing to stop new CPUs being added to ch->cpumask after you start
> scanning ch->cpumask. For example, some new CPU which has a
> timer_deadline_end greater than ch->next_event, so it does not
> reprogram the HPET. But handle_hpet_broadcast is already mid-scan and
> misses this new CPU, so it does not reprogram the HPET either. Hence
> no timer fires for the new CPU and it misses its deadline.

This will not happen. ch->next_event has already been set to STIME_MAX
before we start scanning ch->cpumask, so the new CPU with the smallest
timer_deadline_end will reprogram the HPET successfully.

> Really I think a better approach than something like this patch would
> be to better advertise the timer_slop=xxx Xen boot parameter for
> power-saving scenarios. I wonder what your numbers look like if you
> re-run your benchmark with (say) timer_slop=10000000 (i.e., 10ms
> slop) on the Xen command line?

I think it is another story. Enlarging timer_slop is one way to align and
reduce break events; it does help save power, and possibly brings larger
latency. What I am trying to address here is how to reduce spin_lock
overheads in the idle entry/exit path. The spin_lock overheads, along with
other overheads, in a system with 32 pcpus/64 vcpus caused >25% cpu
utilization while all guests are idle.

Jimmy

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 9:06 ` Wei, Gang
@ 2010-04-21 9:25 ` Keir Fraser
  2010-04-21 9:36 ` Wei, Gang
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-21 9:25 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 21/04/2010 10:06, "Wei, Gang" <gang.wei@intel.com> wrote:

> This will not happen. ch->next_event has already been set to STIME_MAX
> before we start scanning ch->cpumask, so the new CPU with the smallest
> timer_deadline_end will reprogram the HPET successfully.

Okay, then CPU A executes hpet_broadcast_enter() and programs the HPET
channel for its timeout X. Meanwhile concurrently executing
handle_hpet_broadcast misses CPU A but finds some other CPU B with timeout
Y much later than X, and erroneously programs the HPET channel with Y,
causing CPU A to miss its deadline by an arbitrary amount.

I dare say I can carry on finding races. :-)

> What I am trying to address here is how to reduce spin_lock overheads in
> the idle entry/exit path. The spin_lock overheads, along with other
> overheads, in a system with 32 pcpus/64 vcpus caused >25% cpu utilization
> while all guests are idle.

So far it's looked to me like a correctness/performance tradeoff. :-D

 -- Keir

^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 9:25 ` Keir Fraser
@ 2010-04-21 9:36 ` Wei, Gang
  2010-04-21 9:52 ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Wei, Gang @ 2010-04-21 9:36 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Wednesday, 2010-4-21 5:26 PM, Keir Fraser wrote:
> Okay, then CPU A executes hpet_broadcast_enter() and programs the HPET
> channel for its timeout X. Meanwhile concurrently executing
> handle_hpet_broadcast misses CPU A but finds some other CPU B with
> timeout Y much later than X, and erroneously programs the HPET
> channel with Y, causing CPU A to miss its deadline by an arbitrary
> amount.

It is also not possible. handle_hpet_broadcast reprograms the HPET only
while next_event < ch->next_event. Here next_event is Y, and ch->next_event
is X. :-)

> I dare say I can carry on finding races. :-)

Please go on... The two race conditions you mentioned were already
considered before I resent the patch. :-D

^ permalink raw reply [flat|nested] 19+ messages in thread
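The invariant Jimmy appeals to in this exchange — the handler retakes the lock and reprograms the channel only if the deadline it found is still earlier than ch->next_event — can be checked with a small model. This is a single-threaded sketch under stated assumptions, not the Xen implementation: the struct and function names (channel_model, handler_reprogram(), cpu_enter_broadcast()) are illustrative, and ch->next_event simply stands for "what the hardware comparator is programmed to".

```c
#include <assert.h>
#include <stdint.h>

typedef int64_t s_time_t;
#define STIME_MAX INT64_MAX

/* ch->next_event, guarded by ch->lock in the real code. */
struct channel_model { s_time_t next_event; };
static struct channel_model mch = { STIME_MAX };

/* Model of the v2 handler tail: having rescanned with the lock dropped,
 * reprogram only if the candidate deadline is still the earliest known.
 * Returns 1 if the hardware would be reprogrammed, 0 if skipped.          */
static int handler_reprogram(s_time_t found)
{
    if ( found != STIME_MAX && found < mch.next_event )
    {
        mch.next_event = found;   /* reprogram_hpet_evt_channel() */
        return 1;
    }
    return 0;
}

/* Model of hpet_broadcast_enter(): a sleeping CPU programs its own
 * deadline if it is nearer than whatever is currently programmed.         */
static void cpu_enter_broadcast(s_time_t deadline)
{
    if ( deadline < mch.next_event )
        mch.next_event = deadline;
}
```

In Keir's scenario, CPU A programs X while the handler is mid-scan; when the handler later proposes the later deadline Y, the `found < mch.next_event` test fails and X survives, which is exactly the argument made above.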
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 9:36 ` Wei, Gang
@ 2010-04-21 9:52 ` Keir Fraser
  2010-04-21 10:03 ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-21 9:52 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 21/04/2010 10:36, "Wei, Gang" <gang.wei@intel.com> wrote:

> It is also not possible. handle_hpet_broadcast reprograms the HPET only
> while next_event < ch->next_event. Here next_event is Y, and
> ch->next_event is X. :-)

Ah yes, I spotted that just after I replied! Okay I'm almost convinced
now, but...

> Please go on... The two race conditions you mentioned were already
> considered before I resent the patch. :-D

Okay, one concern I still have is over possible races around
cpuidle_wakeup_mwait(). It makes use of a cpumask cpuidle_mwait_flags,
avoiding an IPI to cpus in the mask. However, there is nothing to stop the
CPU having cleared itself from that cpumask before cpuidle does the write
to softirq_pending. In that case, even assuming the CPU is now non-idle
and so wakeup is spurious, a subsequent attempt to
raise_softirq(TIMER_SOFTIRQ) will incorrectly not IPI because the flag is
already set in softirq_pending?

This race would be benign in the current locking strategy (without your
patch) because the CPU that has left mwait_idle_with_hints() cannot get
further than hpet_broadcast_exit() because it will spin on ch->lock, and
there is a guaranteed check of softirq_pending at some point after that.
However your patch removes that synchronisation because ch->lock is not
held across cpuidle_wakeup_mwait(). What do you think to that?

 -- Keir

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 9:52 ` Keir Fraser
@ 2010-04-21 10:03 ` Keir Fraser
  2010-04-22 3:59 ` Wei, Gang
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-21 10:03 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 21/04/2010 10:52, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> Okay, one concern I still have is over possible races around
> cpuidle_wakeup_mwait(). It makes use of a cpumask cpuidle_mwait_flags,
> avoiding an IPI to cpus in the mask. However, there is nothing to stop
> the CPU having cleared itself from that cpumask before cpuidle does the
> write to softirq_pending. In that case, even assuming the CPU is now
> non-idle and so wakeup is spurious, a subsequent attempt to
> raise_softirq(TIMER_SOFTIRQ) will incorrectly not IPI because the flag
> is already set in softirq_pending?

Oh, another one, which can **even occur without your patch**: CPU A adds
itself to cpuidle_mwait_flags while cpuidle_wakeup_mwait() is running.
That function doesn't see CPU A in its first read of the cpumask so it
does not set TIMER_SOFTIRQ for CPU A. But it then *does* see CPU A in its
final read of the cpumask, and hence clears A from the caller's mask.
Hence the caller does not pass CPU A to cpumask_raise_softirq(). Hence
CPU A is erroneously not woken.

Isn't the synchronisation around those mwait/monitor functions just
inherently broken, even without your patch, and your patch just makes it
worse? :-)

 -- Keir

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-21 10:03                     ` Keir Fraser
@ 2010-04-22  3:59                       ` Wei, Gang
  2010-04-22  7:22                         ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Wei, Gang @ 2010-04-22 3:59 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Wednesday, 2010-4-21 6:03 PM, Keir Fraser wrote:
> On 21/04/2010 10:52, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>
>> Okay, one concern I still have is over possible races around
>> cpuidle_wakeup_mwait(). It makes use of a cpumask
>> cpuidle_mwait_flags, avoiding an IPI to cpus in the mask. However,
>> there is nothing to stop the CPU having cleared itself from that
>> cpumask before cpuidle does the write to softirq_pending. In that
>> case, even assuming the CPU is now non-idle and so wakeup is
>> spurious, a subsequent attempt to raise_softirq(TIMER_SOFTIRQ) will
>> incorrectly not IPI because the flag is already set in
>> softirq_pending?

If a CPU has cleared itself from cpuidle_mwait_flags, then that CPU didn't
need an IPI to be woken. And one useless write to softirq_pending doesn't
have any side effect, so this case should be acceptable.

> Oh, another one, which can *even occur without your patch*: CPU A
> adds itself to cpuidle_mwait_flags while cpuidle_wakeup_mwait() is
> running. That function doesn't see CPU A in its first read of the
> cpumask so it does not set TIMER_SOFTIRQ for CPU A. But it then
> *does* see CPU A in its final read of the cpumask, and hence clears A
> from the caller's mask. Hence the caller does not pass CPU A to
> cpumask_raise_softirq(). Hence CPU A is erroneously not woken.

Nice shot. This issue can be resolved by keeping a snapshot of
cpuidle_mwait_flags in cpuidle_wakeup_mwait() and using that snapshot both
for the wakeup writes and for clearing the caller's mask.

> Isn't the synchronisation around those mwait/monitor functions just
> inherently broken, even without your patch, and your patch just makes
> it worse? :-)

How about now, after fixing the second concern you mentioned?

Jimmy

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-22  3:59                       ` Wei, Gang
@ 2010-04-22  7:22                         ` Keir Fraser
  2010-04-22  8:19                           ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-22 7:22 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 22/04/2010 04:59, "Wei, Gang" <gang.wei@intel.com> wrote:

>>> Okay, one concern I still have is over possible races around
>>> cpuidle_wakeup_mwait(). It makes use of a cpumask
>>> cpuidle_mwait_flags, avoiding an IPI to cpus in the mask. However,
>>> there is nothing to stop the CPU having cleared itself from that
>>> cpumask before cpuidle does the write to softirq_pending. In that
>>> case, even assuming the CPU is now non-idle and so wakeup is
>>> spurious, a subsequent attempt to raise_softirq(TIMER_SOFTIRQ) will
>>> incorrectly not IPI because the flag is already set in
>>> softirq_pending?
>
> If a CPU has cleared itself from cpuidle_mwait_flags, then that CPU didn't
> need an IPI to be woken. And one useless write to softirq_pending doesn't
> have any side effect, so this case should be acceptable.

That's not totally convincing. The write to softirq_pending has one extra
side effect: it is possible that the next time TIMER_SOFTIRQ really needs to
be raised on that CPU, it will not receive notification via IPI, because the
flag is already set in its softirq_pending mask.

Hm, let me see if I can come up with a patch for this and post it for you.

 -- Keir

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-22  7:22                         ` Keir Fraser
@ 2010-04-22  8:19                           ` Keir Fraser
  2010-04-22  8:23                             ` Keir Fraser
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-22 8:19 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 955 bytes --]

On 22/04/2010 08:22, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>> If a CPU has cleared itself from cpuidle_mwait_flags, then that CPU didn't
>> need an IPI to be woken. And one useless write to softirq_pending doesn't
>> have any side effect, so this case should be acceptable.
>
> That's not totally convincing. The write to softirq_pending has one extra
> side effect: it is possible that the next time TIMER_SOFTIRQ really needs to
> be raised on that CPU, it will not receive notification via IPI, because the
> flag is already set in its softirq_pending mask.
>
> Hm, let me see if I can come up with a patch for this and post it for you.

How about the attached patch? It MWAITs on a completely new flag field,
avoiding the IPI-avoidance semantics of softirq_pending. It also folds in
your patch. It also does wakeup-waiting checks on timer_deadline_start, that
being the field that initiates wakeup via the MONITORed memory region.

 -- Keir

[-- Attachment #2: 00-mwait --]
[-- Type: application/octet-stream, Size: 1941 bytes --]

diff -r b0562b298d73 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c	Wed Apr 21 12:51:53 2010 +0100
+++ b/xen/arch/x86/acpi/cpu_idle.c	Thu Apr 22 09:17:36 2010 +0100
@@ -153,40 +153,45 @@
 /*
  * The bit is set iff cpu use monitor/mwait to enter C state
  * with this flag set, CPU can be waken up from C state
- * by writing to specific memory address, instead of sending IPI
+ * by writing to specific memory address, instead of sending an IPI.
  */
 static cpumask_t cpuidle_mwait_flags;
+static DEFINE_PER_CPU(bool_t, cpuidle_mwait_wakeup);
 
 void cpuidle_wakeup_mwait(cpumask_t *mask)
 {
     cpumask_t target;
-    int cpu;
+    unsigned int cpu;
 
     cpus_and(target, *mask, cpuidle_mwait_flags);
 
-    /* cpu is 'mwait'ing at softirq_pending,
-       writing to it will wake up CPU */
+    /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
     for_each_cpu_mask(cpu, target)
-        set_bit(TIMER_SOFTIRQ, &softirq_pending(cpu));
+        per_cpu(cpuidle_mwait_wakeup, cpu) = 0;
 
-    cpus_andnot(*mask, *mask, cpuidle_mwait_flags);
+    cpus_andnot(*mask, *mask, target);
 }
 
 static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
     int cpu = smp_processor_id();
 
-    __monitor((void *)&softirq_pending(cpu), 0, 0);
+    __monitor((void *)&this_cpu(cpuidle_mwait_wakeup), 0, 0);
+    smp_mb();
 
-    smp_mb();
-    if (!softirq_pending(cpu))
+    /*
+     * Timer deadline passing is the event on which we will be woken via
+     * cpuidle_mwait_wakeup. So check it now that the location is armed.
+     */
+    if ( per_cpu(timer_deadline_start, cpu) > NOW() )
     {
         cpu_set(cpu, cpuidle_mwait_flags);
-
         __mwait(eax, ecx);
-
         cpu_clear(cpu, cpuidle_mwait_flags);
     }
+
+    if ( per_cpu(timer_deadline_start, cpu) <= NOW() )
+        raise_softirq(TIMER_SOFTIRQ);
 }
 
 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-22  8:19                           ` Keir Fraser
@ 2010-04-22  8:23                             ` Keir Fraser
  2010-04-29 11:08                               ` Wei, Gang
  0 siblings, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-22 8:23 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 22/04/2010 09:19, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>> Hm, let me see if I can come up with a patch for this and post it for you.
>
> How about the attached patch? It MWAITs on a completely new flag
> field, avoiding the IPI-avoidance semantics of softirq_pending. It
> also folds in your patch. It also does wakeup-waiting checks on
> timer_deadline_start, that being the field that initiates wakeup via
> the MONITORed memory region.

...If you do think it looks okay, could you also test it out on relevant
hardware? :-)

Thanks,
Keir

^ permalink raw reply	[flat|nested] 19+ messages in thread
* RE: Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-22  8:23                             ` Keir Fraser
@ 2010-04-29 11:08                               ` Wei, Gang
  0 siblings, 0 replies; 19+ messages in thread
From: Wei, Gang @ 2010-04-29 11:08 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 813 bytes --]

On Thursday, 2010-4-22 4:23 PM, Keir Fraser wrote:
> On 22/04/2010 09:19, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>> How about the attached patch? It MWAITs on a completely new flag
>> field, avoiding the IPI-avoidance semantics of softirq_pending. It
>> also folds in your patch. It also does wakeup-waiting checks on
>> timer_deadline_start, that being the field that initiates wakeup via
>> the MONITORed memory region.
>
> ...If you do think it looks okay, could you also test it out on
> relevant hardware? :-)

I made two modifications to this patch: moved the per-CPU mwait_wakeup flag
into irq_cpustat_t to make it __cacheline_aligned, and added a check for the
timer_deadline_start == 0 case, which means no timer is queued (it took me
quite a lot of time to find out this check is necessary). Tested OK.

Jimmy

[-- Attachment #2: 00-mwait-v2.patch --]
[-- Type: application/octet-stream, Size: 3318 bytes --]

CPUIDLE: re-implement mwait wakeup process

It MWAITs on a completely new flag field, avoiding the IPI-avoidance
semantics of softirq_pending. It also does wakeup-waiting checks on
timer_deadline_start, that being the field that initiates wakeup via the
MONITORed memory region.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Signed-off-by: Wei Gang <gang.wei@intel.com>

diff -r 6e72f5485836 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c	Thu Apr 29 16:49:54 2010 +0800
+++ b/xen/arch/x86/acpi/cpu_idle.c	Thu Apr 29 18:35:38 2010 +0800
@@ -153,40 +153,45 @@ static void acpi_safe_halt(void)
 /*
  * The bit is set iff cpu use monitor/mwait to enter C state
  * with this flag set, CPU can be waken up from C state
- * by writing to specific memory address, instead of sending IPI
+ * by writing to specific memory address, instead of sending an IPI.
  */
 static cpumask_t cpuidle_mwait_flags;
 
 void cpuidle_wakeup_mwait(cpumask_t *mask)
 {
     cpumask_t target;
-    int cpu;
+    unsigned int cpu;
 
     cpus_and(target, *mask, cpuidle_mwait_flags);
 
-    /* cpu is 'mwait'ing at softirq_pending,
-       writing to it will wake up CPU */
+    /* CPU is MWAITing on the mwait_wakeup flag. */
     for_each_cpu_mask(cpu, target)
-        set_bit(TIMER_SOFTIRQ, &softirq_pending(cpu));
-
-    cpus_andnot(*mask, *mask, cpuidle_mwait_flags);
+        mwait_wakeup(cpu) = 0;
+
+    cpus_andnot(*mask, *mask, target);
 }
 
 static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-    int cpu = smp_processor_id();
-
-    __monitor((void *)&softirq_pending(cpu), 0, 0);
-
-    smp_mb();
-    if (!softirq_pending(cpu))
+    unsigned int cpu = smp_processor_id();
+    s_time_t expires = per_cpu(timer_deadline_start, cpu);
+
+    __monitor((void *)&mwait_wakeup(cpu), 0, 0);
+    smp_mb();
+
+    /*
+     * Timer deadline passing is the event on which we will be woken via
+     * cpuidle_mwait_wakeup. So check it now that the location is armed.
+     */
+    if ( expires > NOW() || expires == 0 )
     {
         cpu_set(cpu, cpuidle_mwait_flags);
-
         __mwait(eax, ecx);
-
         cpu_clear(cpu, cpuidle_mwait_flags);
     }
+
+    if ( expires <= NOW() && expires > 0 )
+        raise_softirq(TIMER_SOFTIRQ);
 }
 
 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
diff -r 6e72f5485836 xen/include/asm-x86/hardirq.h
--- a/xen/include/asm-x86/hardirq.h	Thu Apr 29 16:49:54 2010 +0800
+++ b/xen/include/asm-x86/hardirq.h	Thu Apr 29 16:49:56 2010 +0800
@@ -8,6 +8,7 @@ typedef struct {
 	unsigned long __softirq_pending;
 	unsigned int __local_irq_count;
 	unsigned int __nmi_count;
+	bool_t __mwait_wakeup;
 } __cacheline_aligned irq_cpustat_t;
 
 #include <xen/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
diff -r 6e72f5485836 xen/include/xen/irq_cpustat.h
--- a/xen/include/xen/irq_cpustat.h	Thu Apr 29 16:49:54 2010 +0800
+++ b/xen/include/xen/irq_cpustat.h	Thu Apr 29 16:49:56 2010 +0800
@@ -26,5 +26,6 @@ extern irq_cpustat_t irq_stat[];
 #define softirq_pending(cpu)	__IRQ_STAT((cpu), __softirq_pending)
 #define local_irq_count(cpu)	__IRQ_STAT((cpu), __local_irq_count)
 #define nmi_count(cpu)		__IRQ_STAT((cpu), __nmi_count)
+#define mwait_wakeup(cpu)	__IRQ_STAT((cpu), __mwait_wakeup)
 
 #endif	/* __irq_cpustat_h */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-20 16:05         ` Wei, Gang
  2010-04-21  8:09           ` Keir Fraser
@ 2010-04-22  8:21                           ` Keir Fraser
  2010-04-29 11:14                             ` Wei, Gang
  1 sibling, 1 reply; 19+ messages in thread
From: Keir Fraser @ 2010-04-22 8:21 UTC (permalink / raw)
To: Wei, Gang, xen-devel@lists.xensource.com

On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote:

> Resend.
>
> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>

I was going to ask: do you still get the decent power-saving win from this
new version of the patch? You acquire/release the lock a lot more
potentially in this version, since you do so inside the loop over the
cpumask in handle_hpet_broadcast().

 -- Keir

^ permalink raw reply	[flat|nested] 19+ messages in thread
* RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
  2010-04-22  8:21                           ` Keir Fraser
@ 2010-04-29 11:14                             ` Wei, Gang
  0 siblings, 0 replies; 19+ messages in thread
From: Wei, Gang @ 2010-04-29 11:14 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com

On Thursday, 2010-4-22 4:21 PM, Keir Fraser wrote:
> On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote:
>
>> Resend.
>>
>> CPUIDLE: shorten hpet spin_lock holding time
>>
>> Try to reduce spin_lock overhead for deep C state entry/exit. This
>> will benefit systems with a lot of cpus which need the hpet
>> broadcast to wakeup from deep C state.
>>
>> Signed-off-by: Wei Gang <gang.wei@intel.com>
>
> I was going to ask: do you still get the decent power-saving win from
> this new version of the patch? You acquire/release the lock a lot more
> potentially in this version, since you do so inside the loop over
> cpumask in handle_hpet_broadcast().

Yes, I tested it and could still get the decent power-saving win from it.
And it should not have any race issue after the mwait-wakeup
re-implementation patch.

Jimmy

^ permalink raw reply	[flat|nested] 19+ messages in thread
end of thread, other threads:[~2010-04-29 11:14 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-20  5:39 [PATCH] CPUIDLE: shorten hpet spin_lock holding time Wei, Gang
2010-04-20 12:49 ` Keir Fraser
2010-04-20 14:04   ` Wei, Gang
2010-04-20 14:21     ` Keir Fraser
2010-04-20 15:20       ` Wei, Gang
2010-04-20 16:05         ` Wei, Gang
2010-04-21  8:09           ` Keir Fraser
2010-04-21  9:06             ` Wei, Gang
2010-04-21  9:25               ` Keir Fraser
2010-04-21  9:36                 ` Wei, Gang
2010-04-21  9:52                   ` Keir Fraser
2010-04-21 10:03                     ` Keir Fraser
2010-04-22  3:59                       ` Wei, Gang
2010-04-22  7:22                         ` Keir Fraser
2010-04-22  8:19                           ` Keir Fraser
2010-04-22  8:23                             ` Keir Fraser
2010-04-29 11:08                               ` Wei, Gang
2010-04-22  8:21                           ` Keir Fraser
2010-04-29 11:14                             ` Wei, Gang