public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Trace idle entry and exit times
@ 2013-12-07  0:46 Andi Kleen
  2013-12-07  4:57 ` Mike Galbraith
  0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2013-12-07  0:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Andi Kleen, lenb, tglx, peterz

From: Andi Kleen <ak@linux.intel.com>

This patch creates 4 new trace points: one at the beginning of idle,
one before entering mwait, one after exiting mwait, and one after
finishing the idle code. This is useful to trace down any
additional latencies caused by the ever-growing idle code.

Idle latencies are a common source of performance problems, so it's
important to be able to measure them.

I moved mwait out of line to avoid a nasty include loop.

To use:

trace-cmd record -e power:cpu_idle_end -e power:cpu_idle_start -e power:cpu_idle_mwait_end -e power:cpu_idle_mwait_start
run workload
Ctrl-C
trace-cmd report

Compute per CPU intervals between
power:cpu_idle_start and power:cpu_idle_mwait_start
to get the entry latency

and

power:cpu_idle_mwait_end and power:cpu_idle_end
to get the exit latency (likely more important)

Cc: lenb@kernel.org
Cc: tglx@linutronix.de
Cc: peterz@infradead.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/processor.h |  7 +------
 arch/x86/kernel/process.c        |  9 +++++++++
 include/trace/events/power.h     | 30 ++++++++++++++++++++++++++++++
 kernel/cpu/idle.c                |  3 +++
 4 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4..22deb08 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -708,12 +708,7 @@ static inline void __monitor(const void *eax, unsigned long ecx,
 		     :: "a" (eax), "c" (ecx), "d"(edx));
 }
 
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax, %ecx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
+extern void __mwait(unsigned long eax, unsigned long ecx);
 
 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..9083e34 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -469,3 +469,12 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
 
+void __mwait(unsigned long eax, unsigned long ecx)
+{
+	trace_cpu_idle_mwait_start(0);
+	/* "mwait %eax, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+	trace_cpu_idle_mwait_end(0);
+}
+EXPORT_SYMBOL(__mwait);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index cda100d..71261a7 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -35,6 +35,36 @@ DEFINE_EVENT(cpu, cpu_idle,
 	TP_ARGS(state, cpu_id)
 );
 
+/* Dummy argument to work around ftrace's lack of no argument support */
+
+TRACE_EVENT(cpu_idle_start,
+	TP_PROTO(int x),
+	TP_ARGS(x),
+	TP_STRUCT__entry(__array(int, x, 0)),
+	TP_fast_assign((void)(x)),
+	TP_printk("%s", ""))
+
+TRACE_EVENT(cpu_idle_mwait_start,
+	TP_PROTO(int x),
+	TP_ARGS(x),
+	TP_STRUCT__entry(__array(int, x, 0)),
+	TP_fast_assign((void)(x)),
+	TP_printk("%s", ""));
+
+TRACE_EVENT(cpu_idle_mwait_end,
+	TP_PROTO(int x),
+	TP_ARGS(x),
+	TP_STRUCT__entry(__array(int, x, 0)),
+	TP_fast_assign((void)(x)),
+	TP_printk("%s", ""));
+
+TRACE_EVENT(cpu_idle_end,
+	TP_PROTO(int x),
+	TP_ARGS(x),
+	TP_STRUCT__entry(__array(int, x, 0)),
+	TP_fast_assign((void)(x)),
+	TP_printk("%s", ""));
+
 /* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
 #ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
 #define _PWR_EVENT_AVOID_DOUBLE_DEFINING
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a..1cf1676 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -6,6 +6,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <trace/events/power.h>
 
 #include <asm/tlb.h>
 
@@ -68,6 +69,7 @@ void __weak arch_cpu_idle(void)
 static void cpu_idle_loop(void)
 {
 	while (1) {
+		trace_cpu_idle_start(0);
 		tick_nohz_idle_enter();
 
 		while (!need_resched()) {
@@ -114,6 +116,7 @@ static void cpu_idle_loop(void)
 				set_preempt_need_resched();
 		}
 		tick_nohz_idle_exit();
+		trace_cpu_idle_end(0);
 		schedule_preempt_disabled();
 	}
 }
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] Trace idle entry and exit times
  2013-12-07  0:46 [PATCH] Trace idle entry and exit times Andi Kleen
@ 2013-12-07  4:57 ` Mike Galbraith
  2013-12-07  5:20   ` Andi Kleen
  0 siblings, 1 reply; 5+ messages in thread
From: Mike Galbraith @ 2013-12-07  4:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, x86, Andi Kleen, lenb, tglx, peterz

On Fri, 2013-12-06 at 16:46 -0800, Andi Kleen wrote: 
> From: Andi Kleen <ak@linux.intel.com>
> 
> This patch creates 4 new trace points: one at the beginning of idle,
> one before entering mwait, one after exiting mwait, and one after
> finishing the idle code. This is useful to trace down any
> additional latencies caused by the ever-growing idle code.
> 
> Idle latencies are a common source of performance problems, so it's
> important to be able to measure them.

You mean stuff like.. oh, say 60 cores doing zero work every 12ms
munching ~20% cpu?

-  97.57%  [kernel]          [k] _raw_spin_lock_irqsave                                                                                                                                                                                    ▒
   - _raw_spin_lock_irqsave                                                                                                                                                                                                                ▒
      - 97.02% clockevents_notify                                                                                                                                                                                                          ◆
           intel_idle                                                                                                                                                                                                                      ▒
           cpuidle_idle_call                                                                                                                                                                                                               ▒
           cpu_idle                                                                                                                                                                                                                        ▒
      - 2.97% tick_broadcast_oneshot_control                                                                                                                                                                                               ▒
           tick_notify                                                                                                                                                                                                                     ▒
           notifier_call_chain                                                                                                                                                                                                             ▒
           clockevents_notify                                                                                                                                                                                                              ▒
           intel_idle                                                                                                                                                                                                                      ▒
           cpuidle_idle_call                                                                                                                                                                                                               ▒
           cpu_idle

C2 is super green, it's hug'n the sh*t outta them trees :)

	-Mike


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Trace idle entry and exit times
  2013-12-07  4:57 ` Mike Galbraith
@ 2013-12-07  5:20   ` Andi Kleen
  2013-12-07  8:22     ` Mike Galbraith
  0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2013-12-07  5:20 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Andi Kleen, linux-kernel, x86, Andi Kleen, lenb, tglx, peterz

On Sat, Dec 07, 2013 at 05:57:32AM +0100, Mike Galbraith wrote:
> On Fri, 2013-12-06 at 16:46 -0800, Andi Kleen wrote: 
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > This patch creates 4 new trace points: one at the beginning of idle,
> > one before entering mwait, one after exiting mwait, and one after
> > finishing the idle code. This is useful to trace down any
> > additional latencies caused by the ever-growing idle code.
> > 
> > Idle latencies are a common source of performance problems, so it's
> > important to be able to measure them.
> 
> You mean stuff like.. oh, say 60 cores doing zero work every 12ms
> munching ~20% cpu?

Well that one is obvious. I mean more the subtle problems.

BTW the timer problem is fixed on Westmere and later (always running APIC
timer), but yes it's a problem before that. I looked at it some time ago,
but eventually gave up after only fixing low hanging fruit. There are likely 
some more tricks that could be done here (see the old patches from back then)

-Andi

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Trace idle entry and exit times
  2013-12-07  5:20   ` Andi Kleen
@ 2013-12-07  8:22     ` Mike Galbraith
  2013-12-07 17:29       ` Andi Kleen
  0 siblings, 1 reply; 5+ messages in thread
From: Mike Galbraith @ 2013-12-07  8:22 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, x86, Andi Kleen, lenb, tglx, peterz

On Sat, 2013-12-07 at 06:20 +0100, Andi Kleen wrote: 
> On Sat, Dec 07, 2013 at 05:57:32AM +0100, Mike Galbraith wrote:
> > On Fri, 2013-12-06 at 16:46 -0800, Andi Kleen wrote: 
> > > From: Andi Kleen <ak@linux.intel.com>
> > > 
> > > This patch creates 4 new trace points: one at the beginning of idle,
> > > one before entering mwait, one after exiting mwait, and one after
> > > finishing the idle code. This is useful to trace down any
> > > additional latencies caused by the ever-growing idle code.
> > > 
> > > Idle latencies are a common source of performance problems, so it's
> > > important to be able to measure them.
> > 
> > You mean stuff like.. oh, say 60 cores doing zero work every 12ms
> > munching ~20% cpu?
> 
> Well that one is obvious. I mean more the subtle problems.

Yeah, not subtle.  SGI could use C2 as power supply stress tester.

-Mike


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Trace idle entry and exit times
  2013-12-07  8:22     ` Mike Galbraith
@ 2013-12-07 17:29       ` Andi Kleen
  0 siblings, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2013-12-07 17:29 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Andi Kleen, linux-kernel, x86, Andi Kleen, lenb, tglx, peterz

> > Well that one is obvious. I mean more the subtle problems.
> 
> Yeah, not subtle.  SGI could use C2 as power supply stress tester.

This is only on system where the APIC timer does not tick in deep idle.
At this point these are 5+ year old systems.

There were some ideas of breaking up that lock, but they were never
fully implemented.

-Andi

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2013-12-07 17:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-12-07  0:46 [PATCH] Trace idle entry and exit times Andi Kleen
2013-12-07  4:57 ` Mike Galbraith
2013-12-07  5:20   ` Andi Kleen
2013-12-07  8:22     ` Mike Galbraith
2013-12-07 17:29       ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox