public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-23  9:13 Stephane Eranian
  2006-06-30 12:27 ` Andi Kleen
  0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-23  9:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: eranian

This patch contains the PMU context switch routines.




--- linux-2.6.17.1.orig/perfmon/perfmon_ctxsw.c	1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.1/perfmon/perfmon_ctxsw.c	2006-06-21 04:22:51.000000000 -0700
@@ -0,0 +1,381 @@
+/*
+ * perfmon_ctxsw.c: perfmon2 context switch code
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * 	http://www.hpl.hp.com/research/linux/perfmon
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon.h>
+
+#ifdef CONFIG_SMP
+/*
+ * interrupts are masked, runqueue lock is held, context is locked
+ */
+void pfm_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx,
+			struct pfm_event_set *set, int must_reload)
+{
+	u64 cur_act;
+	int reload_pmcs, reload_pmds;
+
+	BUG_ON(task->pid == 0);
+	BUG_ON(__get_cpu_var(pmu_owner));
+
+	BUG_ON(task->pfm_context != ctx);
+
+	cur_act = __get_cpu_var(pmu_activation_number);
+
+	/*
+	 * in case fo zombie, we do not complete ctswin of the
+	 * PMU, and we force a call to pfm_handle_work() to finish
+	 * cleanup, i.e., free context + smpl_buff. The reason for
+	 * deferring to pfm_handle_work() is that it is not possible
+	 * to vfree() with interrupts disabled.
+	 */
+	if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
+		struct thread_info *th_info;
+
+		/*
+		 * ensure everything is properly stopped
+		 */
+		__pfm_stop(ctx);
+
+		ctx->flags.trap_reason = PFM_TRAP_REASON_ZOMBIE;
+		th_info = task->thread_info;
+		set_bit(TIF_NOTIFY_RESUME, &th_info->flags);
+
+		return;
+	}
+
+	if (set->flags & PFM_SETFL_TIME_SWITCH)
+		__get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH;
+ctx->last_cpu=-1;
+	/*
+	 * if we were the last user of the PMU on that CPU,
+	 * then nothing to do except restore psr
+	 */
+	if (ctx->last_cpu == smp_processor_id() && ctx->last_act == cur_act) {
+		/*
+		 * check for forced reload conditions
+		 */
+		reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
+		reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
+	} else {
+		reload_pmcs = 1;
+		reload_pmds = 1;
+	}
+	/* consumed */
+	set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+	if (reload_pmds)
+		pfm_arch_restore_pmds(ctx, set);
+
+	/*
+	 * need to check if had in-flight interrupt in
+	 * pfm_ctxswout_thread(). If at least one bit set, then we must replay
+	 * the interrupt to avoid loosing some important performance data.
+	 */
+	if (set->npend_ovfls) {
+		pfm_arch_resend_irq();
+		__get_cpu_var(pfm_stats).pfm_ovfl_intr_replay_count++;
+	}
+
+	if (reload_pmcs)
+		pfm_arch_restore_pmcs(ctx, set);
+
+	/*
+	 * record current activation for this context
+	 */
+	pfm_inc_activation();
+	pfm_set_last_cpu(ctx, smp_processor_id());
+	pfm_set_activation(ctx);
+
+	/*
+	 * establish new ownership.
+	 */
+	pfm_set_pmu_owner(task, ctx);
+
+	pfm_arch_ctxswin(task, ctx, set);
+}
+#else /*  !CONFIG_SMP */
+/*
+ * interrupts are disabled
+ */
+void pfm_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx,
+			struct pfm_event_set *set, int force_reload)
+{
+	u32 set_priv_flags;
+
+	set_priv_flags = set->priv_flags;
+
+	if (set->flags & PFM_SETFL_TIME_SWITCH) {
+		__get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH;
+	}
+
+	/*
+	 * must force reload due to lazy save
+	 */
+	if (force_reload)
+		set_priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+
+	/*
+	 * check what needs to be restored.
+	 * If owner == task, our state is still live and we could
+	 * just reactivate and go. However, we need to check for the
+	 * following conditions:
+	 * 	- pmu owner != task
+	 * 	- PMDs were modified
+	 * 	- PMCs were modified
+	 * 	- arch modifies PMC to stop monitoring
+	 * 	- there was an in-flight interrupt at pfm_ctxswout_thread()
+	 *
+	 * if anyone of these is true, we cannot take the short path, i.e,
+	 * just restore info + arch_ctxswin and return
+	 */
+	if (set_priv_flags & PFM_SETFL_PRIV_MOD_PMDS)
+		pfm_arch_restore_pmds(ctx, set);
+
+	/*
+	 * need to check if had in-flight interrupt at time of pfm_ctxswout_thread().
+	 * If at least one bit set, then we must replay the interrupt to avoid
+	 * losing some important performance data.
+	 */
+	if (set->npend_ovfls) {
+		pfm_arch_resend_irq();
+		__get_cpu_var(pfm_stats).pfm_ovfl_intr_replay_count++;
+	}
+
+	if (set_priv_flags & PFM_SETFL_PRIV_MOD_PMCS)
+		pfm_arch_restore_pmcs(ctx, set);
+
+	set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+	/*
+	 * establish new ownership.
+	 */
+	pfm_set_pmu_owner(task, ctx);
+
+	/*
+	 * reactivate monitoring
+	 */
+	pfm_arch_ctxswin(task, ctx, set);
+}
+#endif /* !CONFIG_SMP */
+
+static void pfm_ctxswin_sys(struct task_struct *task, struct pfm_context *ctx,
+			    struct pfm_event_set *set)
+{
+	unsigned long info;
+
+	info = __get_cpu_var(pfm_syst_info);
+
+	/*
+	 * don't do anything before started
+	 */
+	if (ctx->flags.started == 0)
+		return;
+
+	/*
+	 * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
+	 * on each CPU, so we can rely on the pid to identify the idle task.
+	 */
+	if (task->pid == 0 && (set->flags & PFM_SETFL_EXCL_IDLE) != 0)
+		pfm_arch_stop(task ,ctx, set);
+	else
+		pfm_arch_ctxswin(task, ctx, set);
+}
+
+void __pfm_ctxswin(struct task_struct *task)
+{
+	struct pfm_context *ctx, *ctxp;
+	struct pfm_event_set *set;
+	int must_force_reload = 0;
+	u64 now_itc;
+
+	ctxp = __get_cpu_var(pmu_ctx);
+	ctx = task->pfm_context;
+
+	/*
+	 * system-wide   : pmu_ctx must not be NULL to proceed
+	 * per-thread  UP: pmu_ctx may be NULL if no left-over owner
+	 * per-thread SMP: pmu_ctx is always NULL coming in
+	 */
+	if (ctxp == NULL && ctx == NULL)
+		return;
+
+#ifdef CONFIG_SMP
+	/*
+	 * if ctxp != 0, it means we are in system-wide mode.
+	 * thereore ctx is NULL (mutual exclusion)
+	 */
+	if (ctxp)
+		ctx = ctxp;
+#else
+	/*
+	 * someone used the PMU, first push it out and
+	 * then we'll be able to install our stuff !
+	 */
+	if (ctxp && ctxp->flags.system)
+		ctx = ctxp;
+	else if (ctx) {
+		if (ctxp && ctxp != ctx) {
+			pfm_save_pmds_release(ctxp);
+			must_force_reload = 1;
+		}
+	} else
+		return;
+#endif
+	spin_lock(&ctx->lock);
+
+	set = ctx->active_set;
+
+	if (ctx->flags.system)
+		pfm_ctxswin_sys(task, ctx, set);
+	else
+		pfm_ctxswin_thread(task, ctx, set, must_force_reload);
+
+	/*
+	 * ctx->duration does count even when context in MASKED state
+	 * set->duration does not count when context in MASKED state.
+	 * But the set->duration_start is reset in unmask_monitoring()
+	 */
+
+	now_itc = pfm_arch_get_itc();
+
+	ctx->duration_start = now_itc;
+	set->duration_start = now_itc;
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * interrupts are masked, runqueue lock is held.
+ *
+ * In UP. we simply stop monitoring and leave the state
+ * in place, i.e., lazy save
+ */
+void pfm_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx,
+			 struct pfm_event_set *set)
+{
+	BUG_ON(task->pfm_context != ctx);
+
+	/*
+	 * stop monitoring and collect any pending
+	 * overflow information into set_povfl_pmds
+	 * and set_npend_ovfls for use on ctxswin_thread()
+	 * to potentially replay the PMU interrupt
+	 *
+	 * The key point is that we cannot afford to loose a PMU
+	 * interrupt. We cannot cancel in-flight interrupts, therefore
+	 * we let them happen and be treated as spurious and then we
+	 * replay them on ctxsw in.
+	 */
+	pfm_arch_ctxswout(task, ctx, set);
+
+#ifdef CONFIG_SMP
+	/*
+	 * release ownership of this PMU.
+	 * PM interrupts are masked, so nothing
+	 * can happen.
+	 */
+	pfm_set_pmu_owner(NULL, NULL);
+
+	/*
+	 * we systematically save the PMD that we effectively
+	 * use. In SMP, we have no guarantee we will be scheduled
+	 * on the same CPU again.
+	 */
+	pfm_modview_begin(set);
+	pfm_arch_save_pmds(ctx, set);
+	pfm_modview_end(set);
+#endif
+
+	/*
+	 * clear cpuinfo, cpuinfo is used in
+	 * per task mode with the set time switch flag.
+	 */
+	__get_cpu_var(pfm_syst_info) = 0;
+}
+
+static void pfm_ctxswout_sys(struct task_struct *task, struct pfm_context *ctx,
+			     struct pfm_event_set *set)
+{
+	/*
+	 * do nothing before started
+	 * XXX: assumes cannot be started from user level
+	 */
+	if (ctx->flags.started == 0)
+		return;
+
+	/*
+	 * restore monitoring if set has EXCL_IDLE and task was idle task
+	 */
+	if (task->pid == 0 && (set->flags & PFM_SETFL_EXCL_IDLE) != 0) {
+		pfm_arch_start(task, ctx, set);
+	} else {
+		pfm_arch_ctxswout(task, ctx, set);
+	}
+}
+
+/*
+ * we come here on every context switch out.
+ */
+void __pfm_ctxswout(struct task_struct *task)
+{
+	struct pfm_context *ctx;
+	struct pfm_event_set *set;
+	u64 now_itc, diff;
+
+	ctx = __get_cpu_var(pmu_ctx);
+	if (ctx == NULL)
+		return;
+
+	now_itc = pfm_arch_get_itc();
+
+	spin_lock(&ctx->lock);
+
+	set = ctx->active_set;
+
+	if (ctx->flags.system) {
+		pfm_ctxswout_sys(task, ctx, set);
+	} else {
+		/*
+		 * in UP, due to lazy save, we may have a
+		 * context loaded onto the PMU BUT it may not
+		 * be the one from the current task. In that case
+		 * simply skip everything else
+		 */
+		if (task->pfm_context == NULL)
+			goto done;
+
+		pfm_ctxswout_thread(task, ctx, set);
+	}
+
+	diff = now_itc - ctx->duration_start;
+	ctx->duration += diff;
+
+	/*
+	 * accumulate only when set is actively monitoring,
+	 */
+	if (ctx->state == PFM_CTX_LOADED)
+		set->duration += now_itc - set->duration_start;
+
+done:
+	spin_unlock(&ctx->lock);
+}

^ permalink raw reply	[flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-30 18:33 Chuck Ebbert
  2006-06-30 18:42 ` Andi Kleen
                   ` (2 more replies)
  0 siblings, 3 replies; 25+ messages in thread
From: Chuck Ebbert @ 2006-06-30 18:33 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, Stephane Eranian

In-Reply-To: <200606301541.22928.ak@suse.de>

On Fri, 30 Jun 2006 15:41:22 +0200, Andi Kleen wrote:

> > So why do we need care about context switch in cpu-wide mode?
> > It is because we support a mode where the idle thread is excluded
> > from cpu-wide monitoring. This is very useful to distinguish 
> > 'useful kernel work' from 'idle'. 
> 
> I don't quite see the point because on x86 the PMU doesn't run
> during C states anyways. So you get idle excluded automatically.

Looks like it does run:

$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
<session to end in 10 seconds>
CPU0     60351837 CPU_CLK_UNHALTED
CPU0    346548229 INTERRUPTS_MASKED_CYCLES

The CPU spent ~60 million clocks unhalted and ~350 million with interrupts
disabled.  (This is an idle 1.6GHz Turion64 machine.)

Now let's see what happens when we exclude the idle thread:

$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10 --excl-idle
<session to end in 10 seconds>
CPU0    449250 CPU_CLK_UNHALTED
CPU0    161577 INTERRUPTS_MASKED_CYCLES

Looks like excluding the idle thread means interrupts that happen while idle
don't get counted either.  We took 5000 clock interrupts and I know they take
longer than that to process.

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-30 19:17 Chuck Ebbert
  2006-06-30 19:37 ` Andi Kleen
  0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-06-30 19:17 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Stephane Eranian, linux-kernel

In-Reply-To: <200606302042.23661.ak@suse.de>

On Fri, 30 Jun 2006 20:42:23 +0200, Andi Kleen wrote:

> > > I don't quite see the point because on x86 the PMU doesn't run
> > > during C states anyways. So you get idle excluded automatically.
> > 
> > Looks like it does run:
> 
> I'm pretty sure it doesn't. You can see it by watching 
> the frequency of the perfctr mode NMI watchdog in /proc/interrupts 
> under different loads.
>
> When the system is idle the frequency goes down and increases
> when the system is busy.

But that is using cpu_clk_unhalted (isn't it?)  If so, it would slow down
when the system is idle.

The BIOS writer's guide, Ch. 10.2, says only events outside of the
processor, like northbridge DMA accesses, stop counting during halt.
(And by definition cpu_clk_unhalted.)

> Are you sure you didn't boot with poll=idle?

$ pfmon --smpl-module=inst-hist --smpl-show-function --smpl-show-top=40 \
  -ecpu_clk_unhalted -k --long-smpl-period=10000 --resolve-addr --system-wide -t 10
only kernel symbols are resolved in system-wide mode
<session to end in 10 seconds>
# counts   %self    %cum code address
    2501  85.42%  85.42% __do_softirq<kernel>
     222   7.58%  93.00% acpi_processor_idle<kernel>   <========
     100   3.42%  96.41% ehci_watchdog<kernel>
      39   1.33%  97.75% ehci_hub_status_data<kernel>

I'm pretty sure. :)  Looking at that pile of code in acpi_processor_idle
and the way it disables interrupts I think I'll switch to idle=halt, though.

> Otherwise something must be wrong with your measurements.

In that case it's all Stephane's fault: he wrote the code!

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-07-01 15:21 Chuck Ebbert
  2006-07-04 15:28 ` Stephane Eranian
  0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-07-01 15:21 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, Andi Kleen

In-Reply-To: <20060630204032.GB22835@frankl.hpl.hp.com>

On Fri, 30 Jun 2006 13:40:32 -0700, Stephane Eranian wrote:

> As Andi is suggesting, I think this may depends on how the BIOS implements
> the low-power state. I have tried the same command on my dual Opteron 250
> 2.4GHz and I get:
> $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0                     9,520,303 CPU_CLK_UNHALTED
> CPU0                     3,726,315 INTERRUPTS_MASKED_CYCLES
> CPU1                    21,268,151 CPU_CLK_UNHALTED
> CPU1                    14,515,389 INTERRUPTS_MASKED_CYCLES

That is similar to what I get with idle=halt. Are you not using ACPI
for idle?

Try this:

$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
<session to end in 10 seconds>
CPU0     95016828 CPU_CLK_UNHALTED
CPU0     36472783 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU0     67484408 INTERRUPTS_MASKED_CYCLES
CPU0    445326968 CYCLES_NO_FPU_OPS_RETIRED

That's what I get with idle=halt.  Since the kernel doesn't do FP
the last line should equal clock cycles.  If it were running at full
speed it would be 16 billion...

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-07-06 17:30 Chuck Ebbert
  2006-07-06 20:16 ` Stephane Eranian
  0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-07-06 17:30 UTC (permalink / raw)
  To: eranian@hpl.hp.com; +Cc: Andi Kleen, linux-kernel

In-Reply-To: <20060704152857.GA6999@frankl.hpl.hp.com>

On Tue, 4 Jul 2006 08:28:57 -0700, Stephane Eranian wrote:

> Here is what I get on my dual 2.4GHz Opteron 250:
> 
> booted with idle=halt
> $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0                    11,356,210 CPU_CLK_UNHALTED                               
> CPU0                             0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> CPU0                     3,836,107 INTERRUPTS_MASKED_CYCLES                       
> CPU0                23,910,784,532 CYCLES_NO_FPU_OPS_RETIRED                      
> CPU1                    19,303,632 CPU_CLK_UNHALTED                               
> CPU1                             0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> CPU1                    13,942,265 INTERRUPTS_MASKED_CYCLES                       
> CPU1                23,911,872,654 CYCLES_NO_FPU_OPS_RETIRED                      

So it looks like your Opteron continues to count CYCLES_NO_FPU_OPS_RETIRED
at full clock speed even when halted.

My Turion appears to slow down to ~40 MHz when halted and counts those
events at that speed.  Using idle=poll shows no slowdown, as expected,
and unhalted clocks equals cycles_no_fpu_ops_retired.

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2006-07-06 20:24 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-23  9:13 [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch Stephane Eranian
2006-06-30 12:27 ` Andi Kleen
2006-06-30 12:36   ` Stephane Eranian
2006-06-30 12:59     ` Andi Kleen
2006-06-30 13:29       ` Stephane Eranian
2006-06-30 13:41         ` Andi Kleen
2006-06-30 14:12           ` Stephane Eranian
2006-06-30 14:33             ` Andi Kleen
2006-06-30 16:02               ` Stephane Eranian
2006-06-30 17:08                 ` Andi Kleen
2006-06-30 20:47                   ` Stephane Eranian
2006-07-03  9:49       ` Stephane Eranian
2006-07-03 19:25         ` Andi Kleen
2006-07-03 19:22           ` Stephane Eranian
2006-07-03 19:36             ` Andi Kleen
  -- strict thread matches above, loose matches on Subject: below --
2006-06-30 18:33 Chuck Ebbert
2006-06-30 18:42 ` Andi Kleen
2006-06-30 18:43 ` Stephane Eranian
2006-06-30 20:40 ` Stephane Eranian
2006-06-30 19:17 Chuck Ebbert
2006-06-30 19:37 ` Andi Kleen
2006-07-01 15:21 Chuck Ebbert
2006-07-04 15:28 ` Stephane Eranian
2006-07-06 17:30 Chuck Ebbert
2006-07-06 20:16 ` Stephane Eranian

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox