All of lore.kernel.org
 help / color / mirror / Atom feed
From: "H. Peter Anvin" <hpa@zytor.com>
To: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Mike Galbraith <bitbucket@online.de>,
	Thomas Gleixner <tglx@linutronix.de>, Len Brown <lenb@kernel.org>,
	Linux PM list <linux-pm@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Jeremy Eder <jeder@redhat.com>,
	x86@kernel.org
Subject: Re: 50 Watt idle power regression bisected to Linux-3.10
Date: Wed, 11 Dec 2013 16:52:43 -0800	[thread overview]
Message-ID: <52A908DB.60902@zytor.com> (raw)
In-Reply-To: <20131211231425.GD8863@pd.tnic>

[-- Attachment #1: Type: text/plain, Size: 824 bytes --]

On 12/11/2013 03:14 PM, Borislav Petkov wrote:
> On Wed, Dec 11, 2013 at 03:08:35PM -0800, H. Peter Anvin wrote:
>> So I would like to propose that we switch to using a percpu variable
>> which is a single cache line of nothing at all. It would only ever
>> be touched by MONITOR and for explicit wakeup. Hopefully that will
>> resolve this problem without the need for the CLFLUSH.
> 
> Yep, makes a lot of sense to me to have an exclusive (overloaded meaning
> here :-)) cacheline only for that. And, if it works, we'll save us the
> penalty from the CLFLUSH too, cool.
> 

Here is a POC patch... anyone willing to test it out?

Two obvious things to watch out for:

1. I couldn't actually spot any obvious cases of a deliberate monitor
   trigger.

2. Should we do cpu_relax() for all users, not just powerclamp?

	-hpa


[-- Attachment #2: 0001-x86-mwait-Use-a-dedicated-percpu-area-for-mwait-door.patch --]
[-- Type: text/x-patch, Size: 9373 bytes --]

>From 20a54d54ea06f050650ab8923b7d9ee1d21b5317 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 11 Dec 2013 16:31:04 -0800
Subject: [PATCH] x86, mwait: Use a dedicated percpu area for mwait doorbell

We have used the cache line that includes thread_info.flags as the
mwait doorbell.  However, this is liable to be dirty in the cache,
which may trigger hardware errata, plus it might cause false wakeups.
Instead, use a dedicated wakeup doorbell area.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h  |  1 -
 arch/x86/include/asm/mwait.h       | 46 ++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h   | 23 -------------------
 arch/x86/kernel/acpi/cstate.c      |  6 +----
 arch/x86/kernel/cpu/common.c       | 17 ++++++++++++++
 arch/x86/kernel/cpu/intel.c        |  3 ---
 arch/x86/kernel/setup_percpu.c     |  3 +++
 arch/x86/kernel/smpboot.c          | 19 +---------------
 drivers/acpi/acpi_pad.c            |  3 +--
 drivers/idle/intel_idle.c          |  4 +---
 drivers/thermal/intel_powerclamp.c |  2 +-
 11 files changed, 71 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e099f95..cdc77f3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -96,7 +96,6 @@
 #define X86_FEATURE_XTOPOLOGY	(3*32+22) /* cpu topology enum extensions */
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC	(3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
 #define X86_FEATURE_EXTD_APICID	(3*32+26) /* has extended APICID (8 bits) */
 #define X86_FEATURE_AMD_DCM     (3*32+27) /* multi-node processor */
 #define X86_FEATURE_APERFMPERF	(3*32+28) /* APERFMPERF */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0..4c82863 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -13,4 +13,50 @@
 
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
 
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+			     unsigned long edx)
+{
+	/* "monitor %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc8;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	trace_hardirqs_on();
+	/* "mwait %eax, %ecx;" */
+	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
+extern char __percpu *mwait_doorbell;
+
+void __init setup_mwait_doorbell(void);
+
+static inline void x86_monitor_doorbell(unsigned long ecx, unsigned long edx)
+{
+	__monitor(__this_cpu_ptr(mwait_doorbell), ecx, edx);
+	mb();
+}
+
+static inline void x86_mwait_doorbell_bing_bong(int cpu)
+{
+	ACCESS_ONCE(*per_cpu_ptr(mwait_doorbell, cpu)) = 0;
+}
+
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b7845a1..a51a79e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -723,29 +723,6 @@ static inline void sync_core(void)
 #endif
 }
 
-static inline void __monitor(const void *eax, unsigned long ecx,
-			     unsigned long edx)
-{
-	/* "monitor %eax, %ecx, %edx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc8;"
-		     :: "a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax, %ecx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-	trace_hardirqs_on();
-	/* "mwait %eax, %ecx;" */
-	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27..aec26c5 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -163,11 +163,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
 	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
+		x86_monitor_doorbell(0, 0);
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172..b6b19ab 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -24,6 +24,7 @@
 #include <linux/cpumask.h>
 #include <asm/pgtable.h>
 #include <linux/atomic.h>
+#include <asm/mwait.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -63,6 +64,22 @@ void __init setup_cpu_local_masks(void)
 	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
+/* allocate percpu area for mwait doorbell */
+char __percpu *mwait_doorbell;
+
+void __init setup_mwait_doorbell(void)
+{
+	if (boot_cpu_has(X86_FEATURE_MWAIT)) {
+		mwait_doorbell = __alloc_percpu(boot_cpu_data.clflush_size,
+						boot_cpu_data.clflush_size);
+
+		if (!mwait_doorbell) {
+			/* This should never happen... */
+			panic("Unable to allocate mwait doorbell!\n");
+		}
+	}
+}
+
 static void default_init(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dc1ec0d..47b4e7b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -387,9 +387,6 @@ static void init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_PEBS);
 	}
 
-	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
-		set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
-
 #ifdef CONFIG_X86_64
 	if (c->x86 == 15)
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5cdff03..917e1af 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -284,4 +284,7 @@ void __init setup_per_cpu_areas(void)
 
 	/* Setup cpu initialized, callin, callout masks */
 	setup_cpu_local_masks();
+
+	/* Setup MWAIT doorbell */
+	setup_mwait_doorbell();
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a..558e097 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1368,7 +1368,6 @@ static inline void mwait_play_dead(void)
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int highest_cstate = 0;
 	unsigned int highest_subcstate = 0;
-	void *mwait_ptr;
 	int i;
 
 	if (!this_cpu_has(X86_FEATURE_MWAIT))
@@ -1400,26 +1399,10 @@ static inline void mwait_play_dead(void)
 			(highest_subcstate - 1);
 	}
 
-	/*
-	 * This should be a memory location in a cache line which is
-	 * unlikely to be touched by other processors.  The actual
-	 * content is immaterial as it is not actually modified in any way.
-	 */
-	mwait_ptr = &current_thread_info()->flags;
-
 	wbinvd();
 
 	while (1) {
-		/*
-		 * The CLFLUSH is a workaround for erratum AAI65 for
-		 * the Xeon 7400 series.  It's not clear it is actually
-		 * needed, but it should be harmless in either case.
-		 * The WBINVD is insufficient due to the spurious-wakeup
-		 * case where we return around the loop.
-		 */
-		clflush(mwait_ptr);
-		__monitor(mwait_ptr, 0, 0);
-		mb();
+		x86_monitor_doorbell(0, 0);
 		__mwait(eax, 0);
 		/*
 		 * If NMI wants to wake up CPU0, start CPU0.
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index fc6008f..38aaa8b 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -193,8 +193,7 @@ static int power_saving_thread(void *data)
 					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 			stop_critical_timings();
 
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			smp_mb();
+			x86_monitor_doorbell(0, 0);
 			if (!need_resched())
 				__mwait(power_saving_mwait_eax, 1);
 
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 92d1206..6fef6d9 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -376,9 +376,7 @@ static int intel_idle(struct cpuidle_device *dev,
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
 	if (!current_set_polling_and_test()) {
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
+		x86_monitor_doorbell(0, 0);
 		if (!need_resched())
 			__mwait(eax, ecx);
 	}
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 8f181b3..15cf013 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -438,7 +438,7 @@ static int clamp_thread(void *arg)
 			 */
 			local_touch_nmi();
 			stop_critical_timings();
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
+			x86_monitor_doorbell(0, 0);
 			cpu_relax(); /* allow HT sibling to run */
 			__mwait(eax, ecx);
 			start_critical_timings();
-- 
1.8.3.1


  reply	other threads:[~2013-12-12  0:52 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-07  8:00 50 Watt idle power regression bisected to Linux-3.10 Len Brown
2013-12-07  8:39 ` Mike Galbraith
2013-12-07 16:01   ` Len Brown
2013-12-07 16:45     ` Len Brown
2013-12-07 19:17       ` Mike Galbraith
2013-12-10 11:41         ` Ingo Molnar
2013-12-07 12:54 ` Thomas Gleixner
2013-12-08  4:57 ` Mike Galbraith
2013-12-08 20:40   ` Len Brown
2013-12-09  3:16     ` Mike Galbraith
2013-12-10  5:17       ` Mike Galbraith
2013-12-10 11:45         ` Ingo Molnar
2013-12-10 14:29         ` Thomas Gleixner
2013-12-10 15:06           ` Ingo Molnar
2013-12-11  2:05           ` Thomas Gleixner
2013-12-11  3:21             ` Mike Galbraith
2013-12-11 11:28               ` Thomas Gleixner
2013-12-11 11:38                 ` Borislav Petkov
2013-12-11 11:52                   ` Peter Zijlstra
2013-12-11 12:29                     ` Mike Galbraith
2013-12-11 12:43                       ` Peter Zijlstra
2013-12-11 13:10                         ` Mike Galbraith
2013-12-11 13:40                         ` Borislav Petkov
2013-12-11 14:56                           ` Ingo Molnar
2013-12-11 16:02                             ` Borislav Petkov
2013-12-11 16:43                             ` Peter Zijlstra
2013-12-11 17:50                               ` Ingo Molnar
2013-12-11 23:08                                 ` H. Peter Anvin
2013-12-11 23:14                                   ` Borislav Petkov
2013-12-12  0:52                                     ` H. Peter Anvin [this message]
2013-12-12  4:25                                       ` Mike Galbraith
2013-12-12  4:49                                         ` H. Peter Anvin
2013-12-12  4:59                                           ` Mike Galbraith
2013-12-12  5:37                                           ` Mike Galbraith
2013-12-12  5:45                                             ` H. Peter Anvin
2013-12-12  5:57                                               ` Mike Galbraith
2013-12-12  6:05                                                 ` Mike Galbraith
2013-12-12  7:57                                                   ` H. Peter Anvin
2013-12-12  8:51                                   ` Peter Zijlstra
2013-12-12 13:28                                     ` Ingo Molnar
2013-12-12 15:06                                       ` H. Peter Anvin
2013-12-12 15:51                                         ` Peter Zijlstra
2013-12-11 14:42                         ` Ingo Molnar
2013-12-11 15:02                           ` Thomas Gleixner
2013-12-11 15:09                             ` Ingo Molnar
2013-12-11 16:44                               ` Peter Zijlstra
2013-12-11 17:48                                 ` Ingo Molnar
2013-12-11 16:44                           ` Peter Zijlstra
2013-12-11 17:47                             ` Ingo Molnar
2013-12-11 21:43                     ` Len Brown
2013-12-11 22:22                       ` Thomas Gleixner
2013-12-18 21:44 ` [PATCH] x86 idle: repair large-server 50-watt idle-power regression Len Brown
2013-12-18 21:44   ` Len Brown
2013-12-19 12:22   ` Ingo Molnar
2013-12-19 14:40     ` H. Peter Anvin
2013-12-19 15:45       ` Borislav Petkov
2013-12-19 15:55     ` H. Peter Anvin
2013-12-19 16:02       ` Ingo Molnar
2013-12-19 16:09         ` H. Peter Anvin
2013-12-19 16:13         ` H. Peter Anvin
2013-12-19 16:21           ` Peter Zijlstra
2013-12-19 16:50             ` H. Peter Anvin
2013-12-19 17:07               ` Ingo Molnar
2013-12-19 17:25                 ` Peter Zijlstra
2013-12-19 17:36                   ` Peter Zijlstra
2013-12-19 18:05                     ` H. Peter Anvin
2013-12-19 18:14                       ` Ingo Molnar
2013-12-19 17:50                   ` Peter Zijlstra
2013-12-19 18:18                     ` Ingo Molnar
2013-12-19 21:05                       ` H. Peter Anvin
2013-12-19 21:17                         ` Ingo Molnar
2013-12-19 18:10                   ` Ingo Molnar
2013-12-19 18:09                 ` H. Peter Anvin
2013-12-19 18:19                   ` H. Peter Anvin
2013-12-19 18:23                     ` Ingo Molnar
     [not found]                       ` <CA+55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com>
2013-12-19 18:43                         ` Ingo Molnar
2013-12-19 18:43                           ` Ingo Molnar
2013-12-19 20:09                         ` [tip:x86/idle] x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers tip-bot for H. Peter Anvin
2013-12-19 20:40                           ` Ingo Molnar
2013-12-19 20:46                             ` Linus Torvalds
2013-12-19 21:14                               ` Ingo Molnar
2013-12-19 21:25                                 ` Linus Torvalds
2013-12-19 21:55                             ` Peter Zijlstra
2013-12-20  8:47                               ` Ingo Molnar
2013-12-19 20:33                         ` [tip:x86/idle] x86, idle: Add memory barriers around clflush in mwait_play_dead() tip-bot for H. Peter Anvin
2013-12-19 18:19                   ` [PATCH] x86 idle: repair large-server 50-watt idle-power regression Ingo Molnar
2013-12-19 19:22                     ` H. Peter Anvin
2013-12-19 19:27                       ` Peter Zijlstra
2013-12-19 19:51   ` [tip:x86/urgent] x86 idle: Repair " tip-bot for Len Brown
2014-03-18  0:20     ` Davidlohr Bueso
2014-03-18  9:16       ` Peter Zijlstra
2014-03-19  2:14         ` Jason Low
2014-03-19  6:42           ` Peter Zijlstra
2014-04-08 21:43       ` Brown, Len
2014-04-09  8:18         ` Peter Zijlstra
2014-04-15  3:27         ` Davidlohr Bueso

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52A908DB.60902@zytor.com \
    --to=hpa@zytor.com \
    --cc=bitbucket@online.de \
    --cc=bp@alien8.de \
    --cc=jeder@redhat.com \
    --cc=lenb@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.