* [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-24  7:34 Luming Yu

From: Luming Yu
To: LKML; +Cc: Len Brown

Hi there,

I've seen many problems caused by a deep-C-state-capable pm_idle on an
NHM-EX system with this test script:

# for i in `seq 1 1000`; do echo $i; \
      echo 0 > /sys/devices/system/cpu/cpu59/online; sleep 1; \
      echo 1 > /sys/devices/system/cpu/cpu59/online; done

as described in the bug "CPU online/offline causes system slowdown",
https://bugzilla.redhat.com/show_bug.cgi?id=586551.

The simplest and cleanest way I can think of now is the patch attached.

Signed-off-by: Yu Luming <luming.yu@intel.com>

diff --git a/kernel/cpu.c b/kernel/cpu.c
index c75fcdd..d419eb3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -135,6 +135,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static void (*pm_idle_saved) (void) __read_mostly;
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -145,6 +146,19 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
+static inline void save_pm_idle(void)
+{
+	pm_idle_saved = pm_idle;
+	pm_idle = default_idle;
+	cpu_idle_wait();
+}
+
+static inline void restore_pm_idle(void)
+{
+	pm_idle = pm_idle_saved;
+	cpu_idle_wait();
+}
+
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
@@ -278,7 +292,9 @@ int __ref cpu_down(unsigned int cpu)
 		goto out;
 	}
 
+	save_pm_idle();
 	err = _cpu_down(cpu, 0);
+	restore_pm_idle();
 
 out:
 	cpu_maps_update_done();
@@ -376,7 +392,9 @@ int __cpuinit cpu_up(unsigned int cpu)
 		goto out;
 	}
 
+	save_pm_idle();
 	err = _cpu_up(cpu, 0);
+	restore_pm_idle();
 
 out:
 	cpu_maps_update_done();

[-- Attachment #2: bz586551-using-default-idle.patch --]
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-24 18:41 Peter Zijlstra

From: Peter Zijlstra
To: Luming Yu; +Cc: LKML, Len Brown

On Mon, 2011-01-24 at 02:34 -0500, Luming Yu wrote:
> Hi there,
>
> I've seen many problems caused by a deep-C-state-capable pm_idle on an
> NHM-EX system with this test script:
>
> # for i in `seq 1 1000`; do echo $i; \
>       echo 0 > /sys/devices/system/cpu/cpu59/online; sleep 1; \
>       echo 1 > /sys/devices/system/cpu/cpu59/online; done
>
> as described in the bug "CPU online/offline causes system slowdown",
> https://bugzilla.redhat.com/show_bug.cgi?id=586551.
>
> The simplest and cleanest way I can think of now is the patch attached.
>
> Signed-off-by: Yu Luming <luming.yu@intel.com>
>
> [...]

Ow god this is ugly.. pm_idle should die asap, not find its way into
generic code, so NAK!
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-25  1:59 Luming Yu

From: Luming Yu
To: Peter Zijlstra; +Cc: LKML, Len Brown

On Tue, Jan 25, 2011 at 2:41 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> [...]
>
> Ow god this is ugly.. pm_idle should die asap, not find its way into
> generic code, so NAK!

Without the ugly fix, we don't seem able to fix the problem in a short
time. Are you suggesting wrapping pm_idle (or something similar) in
generic code that will not disappear in the foreseeable future? Or are
you just suggesting that I not do this in kernel/cpu.c and do it in
arch code instead? But I bet it's not wise to let the CPU hotplug code
interact with PM code for every arch, so I proposed the ugly patch in
generic code.

--Luming
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-25  9:12 Peter Zijlstra

From: Peter Zijlstra
To: Luming Yu; +Cc: LKML, Len Brown

On Mon, 2011-01-24 at 20:59 -0500, Luming Yu wrote:
> > Ow god this is ugly.. pm_idle should die asap, not find its way into
> > generic code, so NAK!
>
> Without the ugly fix, we don't seem able to fix the problem in a short
> time. Are you suggesting wrapping pm_idle (or something similar) in
> generic code that will not disappear in the foreseeable future?

There are patches out there removing pm_idle from x86 (at least).
pm_idle is a horribly broken interface that really should die.

> Or are you just suggesting that I not do this in kernel/cpu.c and do
> it in arch code instead?

Well, as it stands only about half the architectures out there even
have a pm_idle pointer, so your patch would break the other half.

If you really need to do this, do it in arch code, but really, why is
this needed at all? The changelog failed to explain what happens and
why this solves it.

> But I bet it's not wise to let the CPU hotplug code interact with PM
> code for every arch, so I proposed the ugly patch in generic code.

Which, as I mentioned, breaks about half the architectures we have.
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-26  6:42 Luming Yu

From: Luming Yu
To: Peter Zijlstra; +Cc: LKML, Len Brown, H. Peter Anvin

On Tue, Jan 25, 2011 at 4:12 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> [...]
>
> Well, as it stands only about half the architectures out there even
> have a pm_idle pointer, so your patch would break the other half.
>
> If you really need to do this, do it in arch code, but really, why is
> this needed at all? The changelog failed to explain what happens and
> why this solves it.

OK, how about the new patch in the attachment?

We have seen an extremely slow system under the CPU-offline-online test
on a 4-socket NHM-EX system. The test case offlines and onlines a CPU
1000 times, and its performance is dominated by IPI and IPI-handler
performance. On NHM-EX, sending an IPI other than by broadcast is very
slow. Waking a processor from a deep C state by IPI also incurs a
heavyweight delay in set_mtrr synchronization in the stop_machine
context. NHM-EX's C3-stops-APIC-timer behavior adds more trouble.

If I understand the problem correctly, we probably need to tweak the
IPI code upstream to get a clean solution for NHM-EX's slow IPI
delivery, so that the reschedule tick is processed without any delay on
a CPU that was in a deep C state. But that needs more time, so a quick
fix is provided to make the test pass. Without the patch, the current
CPU offline/online feature does not work reliably, since it currently
interacts implicitly and unnecessarily with CPU power management.

--Luming

[-- Attachment #2: switch-idle-procedure.patch --]

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 083e99d..832bbdc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -83,6 +83,7 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
  * for idle threads.
  */
 #ifdef CONFIG_HOTPLUG_CPU
+static struct notifier_block pm_idle_cpu_notifier;
 /*
  * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
  * removed after init for !CONFIG_HOTPLUG_CPU.
@@ -1162,6 +1163,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	uv_system_init();
 
 	set_mtrr_aps_delayed_init();
+	register_hotcpu_notifier(&pm_idle_cpu_notifier);
 out:
 	preempt_enable();
 }
@@ -1469,6 +1471,42 @@ void native_play_dead(void)
 	hlt_play_dead();
 }
 
+static void (*pm_idle_saved)(void);
+
+static inline void save_pm_idle(void)
+{
+	pm_idle_saved = pm_idle;
+	pm_idle = default_idle;
+	cpu_idle_wait();
+}
+
+static inline void restore_pm_idle(void)
+{
+	pm_idle = pm_idle_saved;
+	cpu_idle_wait();
+}
+
+static int pm_idle_callback(struct notifier_block *nfb,
+			unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_DOWN_PREPARE:
+		save_pm_idle();
+		break;
+	case CPU_ONLINE:
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+	case CPU_DEAD:
+		restore_pm_idle();
+		break;
+	}
+	return NOTIFY_OK;
+}
+static struct notifier_block __refdata pm_idle_cpu_notifier = {
+	.notifier_call = pm_idle_callback,
+};
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
 int native_cpu_disable(void)
 {
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-28 10:30 Peter Zijlstra

From: Peter Zijlstra
To: Luming Yu; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

> We have seen an extremely slow system under the CPU-offline-online test
> on a 4-socket NHM-EX system.

Slow is OK, cpu-hotplug isn't performance critical by any means.

> The test case offlines and onlines a CPU 1000 times, and its
> performance is dominated by IPI and IPI-handler performance. On
> NHM-EX, sending an IPI other than by broadcast is very slow. Waking a
> processor from a deep C state by IPI also incurs a heavyweight delay
> in set_mtrr synchronization in the stop_machine context. NHM-EX's
> C3-stops-APIC-timer behavior adds more trouble. If I understand the
> problem correctly, we probably need to tweak the IPI code upstream to
> get a clean solution for NHM-EX's slow IPI delivery, so that the
> reschedule tick is processed without any delay on a CPU that was in a
> deep C state. But that needs more time, so a quick fix is provided to
> make the test pass.

If it's slow but working, the test is broken; I don't see a reason to
do anything to the kernel, let alone the below.

> Without the patch, the current CPU offline/online feature does not
> work reliably,

But you just said it was slow; that means it's reliable, just not fast.

> since it currently interacts implicitly and unnecessarily with CPU
> power management.

Daft statement at best, because if not for some misguided power
management purpose, what are you actually unplugging cpus for?
(Misguided because unplug doesn't actually save more power than simply
idling the cpu.)

> [...]
>
> +static void (*pm_idle_saved)(void);
> +
> +static inline void save_pm_idle(void)
> +{
> +	pm_idle_saved = pm_idle;
> +	pm_idle = default_idle;
> +	cpu_idle_wait();
> +}
> +
> +static inline void restore_pm_idle(void)
> +{
> +	pm_idle = pm_idle_saved;
> +	cpu_idle_wait();
> +}

So you flip the pm_idle pointer protected under the hotplug mutex, but
that's not serialized against module loading, so what happens if you
concurrently load a module that sets another idle policy?

Your changelog is vague at best, so what exactly is the purpose here?
We flip to default_idle(), which uses HLT, which is C1. Then you run
cpu_idle_wait(), which will IPI all cpus; all these CPUs (except one)
could have been in deep C states (C3+), so you get your slow wakeup
anyway.

Thereafter you do the normal stop-machine hot-plug dance, which again
will IPI all cpus once, then you flip it back to the saved pm_idle
handler and again IPI all cpus.

You do this unconditionally, so:

 - you don't avoid a slow C3 wakeup
 - you make a hotplug do 3 IPI round-trips instead of 1

Could you please explain wtf you're doing and why?
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-29  5:44 Luming Yu

From: Luming Yu
To: Peter Zijlstra; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Fri, Jan 28, 2011 at 6:30 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>> We have seen an extremely slow system under the CPU-offline-online test
>> on a 4-socket NHM-EX system.
>
> Slow is OK, cpu-hotplug isn't performance critical by any means.

Here is one example where the "slow" is not acceptable. Maybe I should
not have used "slow" in the first place. It happens after I resolved a
similar NMI watchdog warning in calibrate_delay_direct.

Please note, I got the BUG in a 2.6.32-based kernel. Upstream behaves
similarly, I guess.

BUG: soft lockup - CPU#63 stuck for 61s! [migration/63:256]
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq
freq_table ipv6 dm_mirror dm_region_hash dm_log i2c_i801 i2c_core
iTCO_wdt iTCO_vendor_support ioatdma i7core_edac edac_core sg igb dca
ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic
ata_piix megaraid_sas dm_mod [last unloaded: microcode]
CPU 63:
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq
freq_table ipv6 dm_mirror dm_region_hash dm_log i2c_i801 i2c_core
iTCO_wdt iTCO_vendor_support ioatdma i7core_edac edac_core sg igb dca
ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic
ata_piix megaraid_sas dm_mod [last unloaded: microcode]
Pid: 256, comm: migration/63 Not tainted 2.6.32 #13 QSSC-S4R
RIP: 0010:[<ffffffff81022120>]  [<ffffffff81022120>] mtrr_work_handler+0x20/0xc0
RSP: 0018:ffff88046d997de0  EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffff88046d997df0 RCX: ffff880c8e5f2168
RDX: ffff88046d995520 RSI: 00000000ffffffff RDI: ffff88106dc1dea8
RBP: ffffffff8100bc8e R08: ffff88046d996000 R09: 00000000ffffffff
R10: 00000000ffffffff R11: 0000000000000001 R12: ffff88046d997df0
R13: ffffffff8100bc8e R14: 0000000000000000 R15: ffffffff814c2676
FS:  0000000000000000(0000) GS:ffff880c8e5e0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00007f58b1761098 CR3: 0000000001a25000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Call Trace:
 [<ffffffff810be4da>] ? cpu_stopper_thread+0xda/0x1b0
 [<ffffffff814c2676>] ? thread_return+0x4e/0x778
 [<ffffffff81054792>] ? default_wake_function+0x12/0x20
 [<ffffffff810be400>] ? cpu_stopper_thread+0x0/0x1b0
 [<ffffffff81089d86>] ? kthread+0x96/0xa0
 [<ffffffff8100c1ca>] ? child_rip+0xa/0x20
 [<ffffffff81089cf0>] ? kthread+0x0/0xa0
 [<ffffffff8100c1c0>] ? child_rip+0x0/0x20
BUG: soft lockup - CPU#63 stuck for 61s! [migration/63:256]
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq
freq_table ipv6 dm_mirro

>> The test case [...] But that needs more time, so a quick fix is
>> provided to make the test pass.
>
> If it's slow but working, the test is broken; I don't see a reason to
> do anything to the kernel, let alone the below.

It's not working sometimes, so I think it's not a solid feature right now.

>> Without the patch, the current CPU offline/online feature does not
>> work reliably,
>
> But you just said it was slow; that means it's reliable, just not fast.

I must have used a wrong term. I feel sorry about that.

>> since it currently interacts implicitly and unnecessarily with CPU
>> power management.
>
> Daft statement at best, because if not for some misguided power
> management purpose, what are you actually unplugging cpus for?
> (Misguided because unplug doesn't actually save more power than
> simply idling the cpu.)

It's a RAS feature, and suspend/resume also hits the same code path, I
think.

> So you flip the pm_idle pointer protected under the hotplug mutex, but
> that's not serialized against module loading, so what happens if you
> concurrently load a module that sets another idle policy?
>
> Your changelog is vague at best, so what exactly is the purpose here?
> [...]
> Thereafter you do the normal stop-machine hot-plug dance, which again
> will IPI all cpus once, then you flip it back to the saved pm_idle
> handler and again IPI all cpus.

https://lkml.org/lkml/2009/6/29/60

It takes 50-100us of latency to send one IPI; from that you can get an
idea of a large NHM-EX system which contains 64 logical processors.
With tickless and the APIC timer stopped in C3 on NHM-EX, you can also
get an idea of the problem I have.

Let me know if there are still questions.
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-30 16:36 Peter Zijlstra

From: Peter Zijlstra
To: Luming Yu; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Sat, 2011-01-29 at 13:44 +0800, Luming Yu wrote:
> On Fri, Jan 28, 2011 at 6:30 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> >> We have seen an extremely slow system under the CPU-offline-online
> >> test on a 4-socket NHM-EX system.
> >
> > Slow is OK, cpu-hotplug isn't performance critical by any means.
>
> Here is one example where the "slow" is not acceptable. Maybe I should
> not have used "slow" in the first place. It happens after I resolved a
> similar NMI watchdog warning in calibrate_delay_direct.
>
> Please note, I got the BUG in a 2.6.32-based kernel. Upstream behaves
> similarly, I guess.

Guessing is totally the wrong thing when you're sending stuff upstream,
especially ugly patches such as this. .32 is more than a year old;
anything could have happened.

> BUG: soft lockup - CPU#63 stuck for 61s! [migration/63:256]

> > If it's slow but working, the test is broken; I don't see a reason
> > to do anything to the kernel, let alone the below.
>
> It's not working sometimes, so I think it's not a solid feature right now.

But you didn't say anything about not working, you merely said slow. If
it's not working, you need to very carefully explain what is not
working, where it's deadlocked, how your patch solves this, and how you
avoid wrecking stuff for everybody else.

> >> since it currently interacts implicitly and unnecessarily with CPU
> >> power management.
> >
> > Daft statement at best, because if not for some misguided power
> > management purpose, what are you actually unplugging cpus for?
> > (Misguided because unplug doesn't actually save more power than
> > simply idling the cpu.)
>
> It's a RAS feature, and suspend/resume also hits the same code path, I
> think.

That still doesn't say anything; also, who in his right mind suspends a
NHM-EX system?

> > So you flip the pm_idle pointer protected under the hotplug mutex,
> > but that's not serialized against module loading, so what happens if
> > you concurrently load a module that sets another idle policy?
> > [...]
>
> https://lkml.org/lkml/2009/6/29/60
> It takes 50-100us of latency to send one IPI; from that you can get an
> idea of a large NHM-EX system which contains 64 logical processors.
> With tickless and the APIC timer stopped in C3 on NHM-EX, you can also
> get an idea of the problem I have.

Ok, so one IPI costs 50-100 us; even with 64 cpus, that's at most
6.4ms, nowhere near enough to trigger the NMI watchdog. So what does go
wrong?

Why does your patch solve things? Like I said, it doesn't avoid the
slow IPI at all; you still IPI each cpu right after changing the
pm_idle function. Those IPIs will still hit C3+ states.

> Let me know if there are still questions.

Yeah, what are you smoking? Why do you wreck perfectly fine code for
one backward-ass piece of hardware?
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle
@ 2011-01-31  3:26 Luming Yu

From: Luming Yu
To: Peter Zijlstra; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Mon, Jan 31, 2011 at 12:36 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> Guessing is totally the wrong thing when you're sending stuff
> upstream, especially ugly patches such as this. .32 is more than a
> year old; anything could have happened.

OK. The default upstream kernel seems to have the NMI watchdog
disabled, since there are 0 NMIs:

# cat /proc/interrupts | grep -i nmi
NMI: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Non-maskable interrupts
[root@intel-s3e36-02 ~]# uptime
21:43:27 up 34 min, 2 users, load average: 0.40, 1.27, 2.32

But there is a problem like this:

Booting Node 3 Processor 59 APIC 0x75
Clocksource tsc unstable (delta = -77316000544 ns)
Switching to clocksource hpet

> But you didn't say anything about not working, you merely said slow.
> If it's not working, you need to very carefully explain what is not
> working, where it's deadlocked, how your patch solves this, and how
> you avoid wrecking stuff for everybody else.

It's not working because of the NMI watchdog. If you ignore the NMI
watchdog, then I guess it works, just slowly.

> That still doesn't say anything; also, who in his right mind suspends
> a NHM-EX system?

But we need to have solid code in place. We can't blame a user who
might find something useful in trying that. Letting hotplug code
implicitly interact with CPU PM will complicate things unnecessarily.

> Ok, so one IPI costs 50-100 us; even with 64 cpus, that's at most
> 6.4ms, nowhere near enough to trigger the NMI watchdog. So what does
> go wrong?

Good question! But we also can't forget there was large latency from
C3. And I guess some reschedule ticks that would kick some CPUs out of
idle get lost, due to the side effects of the CPU PM feature. With
nohz=off, everything seems to just work. Yes, I agree we need to dig
that out either way. But it's a kind of combination problem between the
special stop_machine context and CPU power management...

> Why does your patch solve things? Like I said, it doesn't avoid the
> slow IPI at all; you still IPI each cpu right after changing the
> pm_idle function. Those IPIs will still hit C3+ states.
>
> > Let me know if there are still questions.
>
> Yeah, what are you smoking? Why do you wreck perfectly fine code for
> one backward-ass piece of hardware?

Just to make things less complex...

Thanks,
Luming
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle 2011-01-31 3:26 ` Luming Yu @ 2011-01-31 10:16 ` Peter Zijlstra 2011-01-31 14:10 ` Luming Yu 2011-01-31 10:48 ` Peter Zijlstra 1 sibling, 1 reply; 13+ messages in thread From: Peter Zijlstra @ 2011-01-31 10:16 UTC (permalink / raw) To: Luming Yu; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Sun, 2011-01-30 at 22:26 -0500, Luming Yu wrote:
> > Guessing is totally the wrong thing when you're sending stuff upstream,
> > esp ugly patches such as this. .32 is more than a year old, anything
> > could have happened.
>
> Ok. The default upstream kernel seems to have the NMI watchdog disabled?

Then enable it already, it's a whole CONFIG option away.

> It's not working because of the NMI watchdog. If you ignore the NMI
> watchdog, then I guess it works, just slowly.

Don't guess, test it dammit. And then figure out why it triggers; I
haven't seen _anything_ that would cause it to trigger, nor a sane
explanation for your patch.

> > Ok, so one IPI costs 50-100 us; even with 64 cpus that's at most 6.4 ms,
> > nowhere near enough to trigger the NMI watchdog. So what does go wrong?
>
> Good question!
> But we also can't forget there is large wakeup latency from C3.

Not 60+ seconds large, I hope. I know NHM-EX has some suckage, but surely
not that bad?

> And I guess some reschedule ticks that should kick CPUs out of idle get
> lost, due to the side effects of the CPU PM feature. If I use nohz=off,
> everything seems to just work.
> Yes, I agree we need to dig that out either way.
> But it's a kind of combination problem between the special stop_machine
> context and CPU power management...

Yeah, so? Also, incidentally, stop-machine got a rewrite around .35 and
again significant changes in .37, so please do test mainline and not
your dinosaur.

> > Yeah, what are you smoking? Why do you wreck perfectly fine code for one
> > backward ass piece of hardware.
>
> Just make things less complex...
But it's wrong; it very clearly works around a real problem. Don't ever
do that, fix the problem!

^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle 2011-01-31 10:16 ` Peter Zijlstra @ 2011-01-31 14:10 ` Luming Yu 2011-01-31 14:17 ` Peter Zijlstra 0 siblings, 1 reply; 13+ messages in thread From: Luming Yu @ 2011-01-31 14:10 UTC (permalink / raw) To: Peter Zijlstra; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Mon, Jan 31, 2011 at 5:16 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Sun, 2011-01-30 at 22:26 -0500, Luming Yu wrote:
>
>> > Guessing is totally the wrong thing when you're sending stuff upstream,
>> > esp ugly patches such as this. .32 is more than a year old, anything
>> > could have happened.
>>
>> Ok. The default upstream kernel seems to have the NMI watchdog disabled?
>
> Then enable it already, it's a whole CONFIG option away.
>
>> It's not working because of the NMI watchdog. If you ignore the NMI
>> watchdog, then I guess it works, just slowly.
>
> Don't guess, test it dammit. And then figure out why it triggers; I
> haven't seen _anything_ that would cause it to trigger, nor a sane
> explanation for your patch.

As I suspected, it's reproduced with the upstream git tree (head is at
2.6.37) after enabling the lockup detector kernel debug option.

is now offline
Booting Node 3 Processor 59 APIC 0x75
NMI watchdog enabled, takes one hw-pmu counter.
CPU 59 is now offline
Booting Node 3 Processor 59 APIC 0x75
CPU59: Stuck ??
------------[ cut here ]------------
WARNING: at kernel/watchdog.c:227 watchdog_overflow_callback+0xe4/0x110()
Hardware name: QSSC-S4R
Watchdog detected hard LOCKUP on cpu 3
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq freq_table
 mperf ipv6 dm_mirror dm_region_hash dm_log pcspkr shpchp i2c_i801 i2c_core
 iTCO_wdt iTCO_vendor_support ioatdma i7core_edac edac_core sg igb dca ext4
 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix
 megaraid_sas dm_mod [last unloaded: microcode]
Pid: 17, comm: migration/3 Not tainted 2.6.37 #8
Call Trace:
 <NMI>  [<ffffffff810620af>] warn_slowpath_common+0x7f/0xc0
 [<ffffffff810621a6>] warn_slowpath_fmt+0x46/0x50
 [<ffffffff810c4cf4>] watchdog_overflow_callback+0xe4/0x110
 [<ffffffff810f6a2b>] __perf_event_overflow+0x8b/0x220
 [<ffffffff8101c763>] ? intel_pmu_save_and_restart+0x93/0xb0
 [<ffffffff810f7004>] perf_event_overflow+0x14/0x20
 [<ffffffff8101e46a>] intel_pmu_handle_irq+0x25a/0x4d0
 [<ffffffff814ada16>] ? kprobe_exceptions_notify+0x16/0x4a0
 [<ffffffff814ac3b1>] ? hw_breakpoint_exceptions_notify+0x21/0x160
 [<ffffffff814ac548>] perf_event_nmi_handler+0x58/0xf0
 [<ffffffff814ae935>] notifier_call_chain+0x55/0x80
 [<ffffffff81024510>] ? mtrr_work_handler+0x0/0xd0
 [<ffffffff814ae99a>] atomic_notifier_call_chain+0x1a/0x20
 [<ffffffff814ae9ce>] notify_die+0x2e/0x30
 [<ffffffff814abba3>] do_nmi+0x173/0x2b0
 [<ffffffff814ab460>] nmi+0x20/0x30
 [<ffffffff81024510>] ? mtrr_work_handler+0x0/0xd0
 [<ffffffff81024562>] ? mtrr_work_handler+0x52/0xd0
 <<EOE>>  [<ffffffff810b5ff2>] cpu_stopper_thread+0xf2/0x1d0
 [<ffffffff810b5f00>] ? cpu_stopper_thread+0x0/0x1d0
 [<ffffffff810b5f00>] ? cpu_stopper_thread+0x0/0x1d0
 [<ffffffff81083356>] kthread+0x96/0xa0
 [<ffffffff8100ce84>] kernel_thread_helper+0x4/0x10
 [<ffffffff810832c0>] ? kthread+0x0/0xa0
 [<ffffffff8100ce80>] ?
kernel_thread_helper+0x0/0x10
---[ end trace d2115ecb4672c8d5 ]---

[The same hard-LOCKUP warning and an identical call trace then repeat for
cpus 1, 63, 7, 39, 5, 37 and 11, end traces d2115ecb4672c8d6 through
d2115ecb4672c8dc.]

>
>> > Ok, so one IPI costs 50-100 us; even with 64 cpus that's at most 6.4 ms,
>> > nowhere near enough to trigger the NMI watchdog. So what does go wrong?
>>
>> Good question!
>> But we also can't forget there is large wakeup latency from C3.
>
> Not 60+ seconds large, I hope. I know NHM-EX has some suckage, but surely
> not that bad?

I guess the side effects of the large latency could have confused the
high-resolution timer code, which could have caused some reschedule ticks
to be lost. So we can't just multiply that 100 or 200 us latency by 64 to
calculate the suckage.

>
>> And I guess some reschedule ticks that should kick CPUs out of idle get
>> lost, due to the side effects of the CPU PM feature. If I use nohz=off,
>> everything seems to just work.
>> Yes, I agree we need to dig that out either way.
>> But it's a kind of combination problem between the special stop_machine
>> context and CPU power management...
>
> Yeah, so? Also, incidentally, stop-machine got a rewrite around .35 and
> again significant changes in .37, so please do test mainline and not
> your dinosaur.

With a .37 kernel, I've reproduced almost the same problem as with my
.32-based kernel.

>
>> > Yeah, what are you smoking? Why do you wreck perfectly fine code for one
>> > backward ass piece of hardware.
>>
>> Just make things less complex...
>
> But it's wrong, it very clearly works around a real problem. Don't ever
> do that, fix the problem!

My understanding is that the heart of the problem may be triggered by a
hardware defect, in which case we could have no cleaner option than the
solution I proposed here. As long as the defect does not affect the kernel
hot path, I think it should be fine to save some unnecessary complexity.
Yes, I agree that before saying yes to a "workaround", we need to
understand exactly what those side effects are.
I will try to research all of the side effects the IPIs and the C3
latency have on the tickless and highres kernel, after a one-week
vacation starting Feb 1st.

--Luming

^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle 2011-01-31 14:10 ` Luming Yu @ 2011-01-31 14:17 ` Peter Zijlstra 0 siblings, 0 replies; 13+ messages in thread From: Peter Zijlstra @ 2011-01-31 14:17 UTC (permalink / raw) To: Luming Yu; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Mon, 2011-01-31 at 09:10 -0500, Luming Yu wrote:

> My understanding is that the heart of the problem may be triggered by a
> hardware defect, and we could have no cleaner option than the solution
> I proposed here.

Well, a cleaner option is to limit NHM-EX hardware to C1 and simply
disallow anything deeper on it.

> Yes, I agree that before saying yes to a "workaround", we need to
> understand exactly what those side effects are. I will try to research
> all of the side effects the IPIs and the C3 latency have on the
> tickless and highres kernel, after a one-week vacation starting Feb 1st.

OK, great! And enjoy your time away from the computer ;-)

^ permalink raw reply [flat|nested] 13+ messages in thread
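Peter's suggestion to pin NHM-EX at C1 needs no code change at all; on a
2.6.37-era kernel the documented boot parameters for limiting C states
are along these lines (intel_idle.max_cstate=0 disables the intel_idle
driver so the ACPI limit applies; whether both are needed depends on
which idle driver the machine actually uses):

```
processor.max_cstate=1 intel_idle.max_cstate=0
```

This is a system-wide big hammer rather than the per-hotplug-window
switch the patch implements, but it removes deep C states as a variable
when bisecting a problem like this one.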
* Re: [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle 2011-01-31 3:26 ` Luming Yu 2011-01-31 10:16 ` Peter Zijlstra @ 2011-01-31 10:48 ` Peter Zijlstra 1 sibling, 0 replies; 13+ messages in thread From: Peter Zijlstra @ 2011-01-31 10:48 UTC (permalink / raw) To: Luming Yu; +Cc: LKML, Len Brown, H. Peter Anvin, tglx

On Sun, 2011-01-30 at 22:26 -0500, Luming Yu wrote:

> Booting Node 3 Processor 59 APIC 0x75
> Clocksource tsc unstable (delta = -77316000544 ns)
> Switching to clocksource hpet

I think that's known NHM-EX suckage; it does all kinds of stupid things
in deeper C states that other NHM/WSM chips don't do.

^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2011-01-31 14:16 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2011-01-24 7:34 [PATCH] a patch to fix the cpu-offline-online problem caused by pm_idle Luming Yu 2011-01-24 18:41 ` Peter Zijlstra 2011-01-25 1:59 ` Luming Yu 2011-01-25 9:12 ` Peter Zijlstra 2011-01-26 6:42 ` Luming Yu 2011-01-28 10:30 ` Peter Zijlstra 2011-01-29 5:44 ` Luming Yu 2011-01-30 16:36 ` Peter Zijlstra 2011-01-31 3:26 ` Luming Yu 2011-01-31 10:16 ` Peter Zijlstra 2011-01-31 14:10 ` Luming Yu 2011-01-31 14:17 ` Peter Zijlstra 2011-01-31 10:48 ` Peter Zijlstra