* [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-23 9:13 Stephane Eranian
2006-06-30 12:27 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-23 9:13 UTC (permalink / raw)
To: linux-kernel; +Cc: eranian
This patch contains the PMU context switch routines.
--- linux-2.6.17.1.orig/perfmon/perfmon_ctxsw.c 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.1/perfmon/perfmon_ctxsw.c 2006-06-21 04:22:51.000000000 -0700
@@ -0,0 +1,381 @@
+/*
+ * perfmon_ctxsw.c: perfmon2 context switch code
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon.h>
+
+#ifdef CONFIG_SMP
+/*
+ * interrupts are masked, runqueue lock is held, context is locked
+ *
+ * Restore the PMU state for an incoming monitored thread (SMP).
+ * If this context was the last PMU user on this CPU and no other
+ * activation happened in between, only the registers explicitly
+ * modified while the thread was switched out need reloading.
+ */
+void pfm_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx,
+ struct pfm_event_set *set, int must_reload)
+{
+ u64 cur_act;
+ int reload_pmcs, reload_pmds;
+
+ BUG_ON(task->pid == 0);
+ BUG_ON(__get_cpu_var(pmu_owner));
+
+ BUG_ON(task->pfm_context != ctx);
+
+ cur_act = __get_cpu_var(pmu_activation_number);
+
+ /*
+ * in case of zombie, we do not complete ctxswin of the
+ * PMU, and we force a call to pfm_handle_work() to finish
+ * cleanup, i.e., free context + smpl_buff. The reason for
+ * deferring to pfm_handle_work() is that it is not possible
+ * to vfree() with interrupts disabled.
+ */
+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
+ struct thread_info *th_info;
+
+ /*
+ * ensure everything is properly stopped
+ */
+ __pfm_stop(ctx);
+
+ ctx->flags.trap_reason = PFM_TRAP_REASON_ZOMBIE;
+ th_info = task->thread_info;
+ set_bit(TIF_NOTIFY_RESUME, &th_info->flags);
+
+ return;
+ }
+
+ if (set->flags & PFM_SETFL_TIME_SWITCH)
+ __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH;
+
+ /*
+ * if we were the last user of the PMU on that CPU,
+ * then nothing to do except restore psr
+ */
+ if (ctx->last_cpu == smp_processor_id() && ctx->last_act == cur_act) {
+ /*
+ * check for forced reload conditions
+ */
+ reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
+ reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
+ } else {
+ reload_pmcs = 1;
+ reload_pmds = 1;
+ }
+ /* consumed */
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ if (reload_pmds)
+ pfm_arch_restore_pmds(ctx, set);
+
+ /*
+ * need to check if had in-flight interrupt in
+ * pfm_ctxswout_thread(). If at least one bit set, then we must replay
+ * the interrupt to avoid losing some important performance data.
+ */
+ if (set->npend_ovfls) {
+ pfm_arch_resend_irq();
+ __get_cpu_var(pfm_stats).pfm_ovfl_intr_replay_count++;
+ }
+
+ if (reload_pmcs)
+ pfm_arch_restore_pmcs(ctx, set);
+
+ /*
+ * record current activation for this context
+ */
+ pfm_inc_activation();
+ pfm_set_last_cpu(ctx, smp_processor_id());
+ pfm_set_activation(ctx);
+
+ /*
+ * establish new ownership.
+ */
+ pfm_set_pmu_owner(task, ctx);
+
+ pfm_arch_ctxswin(task, ctx, set);
+}
+#else /* !CONFIG_SMP */
+/*
+ * interrupts are disabled
+ *
+ * UP variant: thanks to lazy save the PMU state may still be live on
+ * the hardware. force_reload is set by __pfm_ctxswin() when another
+ * context's state had to be pushed out first, in which case both PMCs
+ * and PMDs must be restored regardless of the modification flags.
+ */
+void pfm_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx,
+ struct pfm_event_set *set, int force_reload)
+{
+ u32 set_priv_flags;
+
+ set_priv_flags = set->priv_flags;
+
+ if (set->flags & PFM_SETFL_TIME_SWITCH) {
+ __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH;
+ }
+
+ /*
+ * must force reload due to lazy save
+ */
+ if (force_reload)
+ set_priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+
+ /*
+ * check what needs to be restored.
+ * If owner == task, our state is still live and we could
+ * just reactivate and go. However, we need to check for the
+ * following conditions:
+ * - pmu owner != task
+ * - PMDs were modified
+ * - PMCs were modified
+ * - arch modifies PMC to stop monitoring
+ * - there was an in-flight interrupt at pfm_ctxswout_thread()
+ *
+ * if anyone of these is true, we cannot take the short path, i.e,
+ * just restore info + arch_ctxswin and return
+ */
+ if (set_priv_flags & PFM_SETFL_PRIV_MOD_PMDS)
+ pfm_arch_restore_pmds(ctx, set);
+
+ /*
+ * need to check if had in-flight interrupt at time of pfm_ctxswout_thread().
+ * If at least one bit set, then we must replay the interrupt to avoid
+ * losing some important performance data.
+ */
+ if (set->npend_ovfls) {
+ pfm_arch_resend_irq();
+ __get_cpu_var(pfm_stats).pfm_ovfl_intr_replay_count++;
+ }
+
+ if (set_priv_flags & PFM_SETFL_PRIV_MOD_PMCS)
+ pfm_arch_restore_pmcs(ctx, set);
+
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ /*
+ * establish new ownership.
+ */
+ pfm_set_pmu_owner(task, ctx);
+
+ /*
+ * reactivate monitoring
+ */
+ pfm_arch_ctxswin(task, ctx, set);
+}
+#endif /* !CONFIG_SMP */
+
+/*
+ * Context switch in for a system-wide (cpu-wide) session.
+ *
+ * When the incoming task is the idle task and the active set carries
+ * PFM_SETFL_EXCL_IDLE, monitoring is stopped so idle time is excluded
+ * from the cpu-wide measurement; otherwise monitoring is reactivated.
+ */
+static void pfm_ctxswin_sys(struct task_struct *task, struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ /*
+ * don't do anything before started
+ */
+ if (ctx->flags.started == 0)
+ return;
+
+ /*
+ * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
+ * on each CPU, so we can rely on the pid to identify the idle task.
+ */
+ if (task->pid == 0 && (set->flags & PFM_SETFL_EXCL_IDLE) != 0)
+ pfm_arch_stop(task, ctx, set);
+ else
+ pfm_arch_ctxswin(task, ctx, set);
+}
+
+/*
+ * Context switch in hook, called on every switch-in when either a
+ * cpu-wide session is active on this CPU or the incoming task is
+ * monitored in per-thread mode.
+ */
+void __pfm_ctxswin(struct task_struct *task)
+{
+ struct pfm_context *ctx, *ctxp;
+ struct pfm_event_set *set;
+ int must_force_reload = 0;
+ u64 now_itc;
+
+ ctxp = __get_cpu_var(pmu_ctx);
+ ctx = task->pfm_context;
+
+ /*
+ * system-wide : pmu_ctx must not be NULL to proceed
+ * per-thread UP: pmu_ctx may be NULL if no left-over owner
+ * per-thread SMP: pmu_ctx is always NULL coming in
+ */
+ if (ctxp == NULL && ctx == NULL)
+ return;
+
+#ifdef CONFIG_SMP
+ /*
+ * if ctxp != NULL, it means we are in system-wide mode.
+ * therefore ctx is NULL (mutual exclusion)
+ */
+ if (ctxp)
+ ctx = ctxp;
+#else
+ /*
+ * someone used the PMU, first push it out and
+ * then we'll be able to install our stuff !
+ */
+ if (ctxp && ctxp->flags.system)
+ ctx = ctxp;
+ else if (ctx) {
+ if (ctxp && ctxp != ctx) {
+ pfm_save_pmds_release(ctxp);
+ must_force_reload = 1;
+ }
+ } else
+ return;
+#endif
+ spin_lock(&ctx->lock);
+
+ set = ctx->active_set;
+
+ if (ctx->flags.system)
+ pfm_ctxswin_sys(task, ctx, set);
+ else
+ pfm_ctxswin_thread(task, ctx, set, must_force_reload);
+
+ /*
+ * ctx->duration does count even when context in MASKED state.
+ * set->duration does not count when context in MASKED state.
+ * But the set->duration_start is reset in unmask_monitoring()
+ */
+
+ now_itc = pfm_arch_get_itc();
+
+ ctx->duration_start = now_itc;
+ set->duration_start = now_itc;
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * interrupts are masked, runqueue lock is held.
+ *
+ * In UP, we simply stop monitoring and leave the state
+ * in place, i.e., lazy save
+ */
+void pfm_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ BUG_ON(task->pfm_context != ctx);
+
+ /*
+ * stop monitoring and collect any pending
+ * overflow information into set->povfl_pmds
+ * and set->npend_ovfls for use on ctxswin_thread()
+ * to potentially replay the PMU interrupt
+ *
+ * The key point is that we cannot afford to lose a PMU
+ * interrupt. We cannot cancel in-flight interrupts, therefore
+ * we let them happen and be treated as spurious and then we
+ * replay them on ctxsw in.
+ */
+ pfm_arch_ctxswout(task, ctx, set);
+
+#ifdef CONFIG_SMP
+ /*
+ * release ownership of this PMU.
+ * PMU interrupts are masked, so nothing
+ * can happen.
+ */
+ pfm_set_pmu_owner(NULL, NULL);
+
+ /*
+ * we systematically save the PMDs that we effectively
+ * use. In SMP, we have no guarantee we will be scheduled
+ * on the same CPU again, so no lazy save is possible.
+ */
+ pfm_modview_begin(set);
+ pfm_arch_save_pmds(ctx, set);
+ pfm_modview_end(set);
+#endif
+
+ /*
+ * clear cpuinfo, cpuinfo is used in
+ * per task mode with the set time switch flag.
+ */
+ __get_cpu_var(pfm_syst_info) = 0;
+}
+
+/*
+ * Context switch out for a system-wide (cpu-wide) session.
+ *
+ * Mirror of pfm_ctxswin_sys(): when the outgoing task is the idle task
+ * and the set excludes idle, monitoring is restarted for the incoming
+ * (non-idle) work; otherwise the arch layer handles the switch out.
+ */
+static void pfm_ctxswout_sys(struct task_struct *task, struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ /*
+ * do nothing before started
+ * XXX: assumes cannot be started from user level
+ */
+ if (ctx->flags.started == 0)
+ return;
+
+ /*
+ * restore monitoring if set has EXCL_IDLE and task was idle task
+ */
+ if (task->pid == 0 && (set->flags & PFM_SETFL_EXCL_IDLE) != 0) {
+ pfm_arch_start(task, ctx, set);
+ } else {
+ pfm_arch_ctxswout(task, ctx, set);
+ }
+}
+
+/*
+ * we come here on every context switch out.
+ *
+ * Dispatches to the system-wide or per-thread switch-out path and
+ * accumulates the context/set duration counters.
+ */
+void __pfm_ctxswout(struct task_struct *task)
+{
+ struct pfm_context *ctx;
+ struct pfm_event_set *set;
+ u64 now_itc, diff;
+
+ ctx = __get_cpu_var(pmu_ctx);
+ if (ctx == NULL)
+ return;
+
+ now_itc = pfm_arch_get_itc();
+
+ spin_lock(&ctx->lock);
+
+ set = ctx->active_set;
+
+ if (ctx->flags.system) {
+ pfm_ctxswout_sys(task, ctx, set);
+ } else {
+ /*
+ * in UP, due to lazy save, we may have a
+ * context loaded onto the PMU BUT it may not
+ * be the one from the current task. In that case
+ * simply skip everything else
+ */
+ if (task->pfm_context == NULL)
+ goto done;
+
+ pfm_ctxswout_thread(task, ctx, set);
+ }
+
+ diff = now_itc - ctx->duration_start;
+ ctx->duration += diff;
+
+ /*
+ * accumulate only when the set is actively monitoring
+ * (not in MASKED state)
+ */
+ if (ctx->state == PFM_CTX_LOADED)
+ set->duration += now_itc - set->duration_start;
+
+done:
+ spin_unlock(&ctx->lock);
+}
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-23 9:13 Stephane Eranian
@ 2006-06-30 12:27 ` Andi Kleen
2006-06-30 12:36 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 12:27 UTC (permalink / raw)
To: Stephane Eranian; +Cc: eranian, linux-kernel
Stephane Eranian <eranian@frankl.hpl.hp.com> writes:
> This patch contains the PMU context switch routines.
Description/why/what etc. missing.
<quick look>
This is all unconditionally called at context switch?!??
No way this can be merged. It needs to be zero cost for any process
that doesn't use perfmon and even for those it probably needs
some tuning.
See my earlier mail on how to make it zero cost for i386/x86-64.
Please don't submit such horrible code again.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 12:27 ` Andi Kleen
@ 2006-06-30 12:36 ` Stephane Eranian
2006-06-30 12:59 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 12:36 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi,
Thanks for your feedback. I will make the changes you
requested.
About the context switch code, what about I do the following
in __switch_to():
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
if (unlikely(__get_cpu_var(pmu_ctx) || next_p->pfm_context))
__pfm_ctxswout(prev_p, next_p);
/*
* Reload esp0, LDT and the page table pointer:
*/
tss->rsp0 = next->rsp0;
There is now a single hook and a conditional branch.
this is similar to what you have with the debug registers.
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 12:36 ` Stephane Eranian
@ 2006-06-30 12:59 ` Andi Kleen
2006-06-30 13:29 ` Stephane Eranian
2006-07-03 9:49 ` Stephane Eranian
0 siblings, 2 replies; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 12:59 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
Stephane Eranian <eranian@hpl.hp.com> writes:
> Andi,
>
> Thanks for your feedback. I will make the changes you
> requested.
>
> About the context switch code, what about I do the following
> in __switch_to():
>
> __kprobes struct task_struct *
> __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> {
> struct thread_struct *prev = &prev_p->thread,
> *next = &next_p->thread;
> int cpu = smp_processor_id();
> struct tss_struct *tss = &per_cpu(init_tss, cpu);
>
> if (unlikely(__get_cpu_var(pmu_ctx) || next_p->pfm_context))
> __pfm_ctxswout(prev_p, next_p);
>
> /*
> * Reload esp0, LDT and the page table pointer:
> */
> tss->rsp0 = next->rsp0;
>
> There is now a single hook and a conditional branch.
> this is similar to what you have with the debug registers.
It's still more than there was before. Also __get_cpu_var
is quite a lot of instructions.
I would suggest you borrow some bits in one of the process
or thread info flags and then do a single test
if (unlikely(thr->flags & (DEBUG|PERFMON)) != 0) {
if (flags & DEBUG)
... do debug ...
if (flags & PERFMON)
... do perfmon ...
}
[which you're at it you can probably add ioports in there too -
improving existing code is always a good thing]
Ideally flags is in some cache line that is already
touched during context switch. If not you might need
to change the layout.
It's ok to put the do_perfmon stuff into a separate noinline
function because that will disturb the register allocation
in the caller less.
I would suggest doing this in separate preparing patches that
first just do it for existing facilities.
-Andi
P.S.: My comments probably apply to the i386 versions too
although I haven't read them.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 12:59 ` Andi Kleen
@ 2006-06-30 13:29 ` Stephane Eranian
2006-06-30 13:41 ` Andi Kleen
2006-07-03 9:49 ` Stephane Eranian
1 sibling, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 13:29 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi,
On Fri, Jun 30, 2006 at 02:59:05PM +0200, Andi Kleen wrote:
> Stephane Eranian <eranian@hpl.hp.com> writes:
> >
> > __kprobes struct task_struct *
> > __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> > {
> > struct thread_struct *prev = &prev_p->thread,
> > *next = &next_p->thread;
> > int cpu = smp_processor_id();
> > struct tss_struct *tss = &per_cpu(init_tss, cpu);
> >
> > if (unlikely(__get_cpu_var(pmu_ctx) || next_p->pfm_context))
> > __pfm_ctxswout(prev_p, next_p);
> >
> > /*
> > * Reload esp0, LDT and the page table pointer:
> > */
> > tss->rsp0 = next->rsp0;
> >
> > There is now a single hook and a conditional branch.
> > this is similar to what you have with the debug registers.
>
> It's still more than there was before. Also __get_cpu_var
> is quite a lot of instructions.
>
Ok, let me explain why we have the pmu_tx per-cpu variable.
Perfmon supports per thread and cpu-wide monitoring (across all
threads running on one CPU).
In per-thread mode, a perfmon context is attached to the monitored
thread in task->pfm_context. When the thread is context switched in
that context is also stored into the per-cpu variable to indicate
the currently active context. This may seem a bit redundant because
in this mode current->pfm_context == pmu_ctx in SMP mode. However
in UP mode, we support lazy save/restore and there pmu_ctx stores
the last owner.
In cpu-wide mode, the perfmon context is not attached to any thread.
It is logically attached to the CPU being monitored and is pointed
to by pmu_ctx. That variable is used to get to the active perfmon
context on PMU interrupt for instance (because current->pfm_context is
always NULL).
So why do we need care about context switch in cpu-wide mode?
It is because we support a mode where the idle thread is excluded
from cpu-wide monitoring. This is very useful to distinguish
'useful kernel work' from 'idle'. As you realize, that means
that we need to turn off when the idle thread is context switched
in and turn it back on when it is switched off.
Having said that, and based on your suggestion, I think we could
simply mark the PERFMON flag in the idle thread when needed in cpu-wide
mode. That would bring the test in __switch_to() to something along
those lines:
prev = &prev_p->thread;
next = &next_p->thread;
if (unlikely((prev->flags & PERFMON) || (next->flags & PERFMON)))
pfm_switch_to(prev_p, next_p);
I could split prev from next and merge next into the DEBUG and BITMAP stuff
as you suggest below.
What do you think?
> I would suggest you borrow some bits in one of the process
> or thread info flags and then do a single test
>
> if (unlikely(thr->flags & (DEBUG|PERFMON)) != 0) {
> if (flags & DEBUG)
> ... do debug ...
> if (flags & PERFMON)
> ... do perfmon ...
> }
>
> [which you're at it you can probably add ioports in there too -
> improving existing code is always a good thing]
>
> Ideally flags is in some cache line that is already
> touched during context switch. If not you might need
> to change the layout.
>
> It's ok to put the do_perfmon stuff into a separate noinline
> function because that will disturb the register allocation
> in the caller less.
>
> I would suggest doing this in separate preparing patches that
> first just do it for existing facilities.
>
> -Andi
>
> P.S.: My comments probably apply to the i386 versions too
> although I haven't read them.
Yes, this is the same for i386.
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 13:29 ` Stephane Eranian
@ 2006-06-30 13:41 ` Andi Kleen
2006-06-30 14:12 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 13:41 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
> So why do we need care about context switch in cpu-wide mode?
> It is because we support a mode where the idle thread is excluded
> from cpu-wide monitoring. This is very useful to distinguish
> 'useful kernel work' from 'idle'.
I don't quite see the point because on x86 the PMU doesn't run
during C states anyways. So you get idle excluded automatically.
And on the other hand a lot of people especially want idle
accounting too and boot with idle=poll. Your explicit
code would likely defeat that.
> As you realize, that means
> that we need to turn off when the idle thread is context switched
> in and turn it back on when it is switched off.
Also x86-64 has idle notifiers for this if you really wanted
to do it properly.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 13:41 ` Andi Kleen
@ 2006-06-30 14:12 ` Stephane Eranian
2006-06-30 14:33 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 14:12 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi,
On Fri, Jun 30, 2006 at 03:41:22PM +0200, Andi Kleen wrote:
>
> > So why do we need care about context switch in cpu-wide mode?
> > It is because we support a mode where the idle thread is excluded
> > from cpu-wide monitoring. This is very useful to distinguish
> > 'useful kernel work' from 'idle'.
>
The exclude-idle feature is an option you select when you create
your cpu-wide session. By default, it is off.
> I don't quite see the point because on x86 the PMU doesn't run
> during C states anyways. So you get idle excluded automatically.
>
Yes, but that may not necessarily be true of all architectures.
At least with the option, the interfaces provides some guarantee.
> And on the other hand a lot of people especially want idle
> accounting too and boot with idle=poll. Your explicit
> code would likely defeat that.
>
> > As you realize, that means
> > that we need to turn off when the idle thread is context switched
> > in and turn it back on when it is switched off.
>
> Also x86-64 has idle notifiers for this if you really wanted
> to do it properly.
>
That looks like a useful feature I could leverage but why is it just
on x86-64 at the moment?
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 14:12 ` Stephane Eranian
@ 2006-06-30 14:33 ` Andi Kleen
2006-06-30 16:02 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 14:33 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
On Friday 30 June 2006 16:12, Stephane Eranian wrote:
> Andi,
>
> On Fri, Jun 30, 2006 at 03:41:22PM +0200, Andi Kleen wrote:
> >
> > > So why do we need care about context switch in cpu-wide mode?
> > > It is because we support a mode where the idle thread is excluded
> > > from cpu-wide monitoring. This is very useful to distinguish
> > > 'useful kernel work' from 'idle'.
> >
>
> The exclude-idle feature is an option you select when you create
> your cpu-wide session. By default, it is off.
>
> > I don't quite see the point because on x86 the PMU doesn't run
> > during C states anyways. So you get idle excluded automatically.
> >
> Yes, but that may not necessarily be troe of all architectures.
> At least with the option, the interfaces provides some guarantee.
I don't think it makes sense to complicate the software if the
hardware already guarantees it. So please remove it.
If there is ever an PMU which ticks in C states you could readd
it, but it would surprise me if that will ever happen because
it would conflict with power saving.
Actually there is one reason to use idle notifiers anyways for the PMU -
it can be used to correct for the not ticking PMU in C. So e.g.
you could synthesize an artificial counter out of CPU_CLK_UNHALTED
(and equivalents) + RDTSC measurements before/after idle
(+ correcting the overflows for lost time). With that people
can get full accounting including idle without doing nasty
things like idle=poll
I wanted to do that for oprofile at some point but never got
around to it. But that was one of the reasons the idle notifiers
got added.
But without that I don't think you should special case idle
at all.
>
> > And on the other hand a lot of people especially want idle
> > accounting too and boot with idle=poll. Your explicit
> > code would likely defeat that.
> >
> > > As you realize, that means
> > > that we need to turn off when the idle thread is context switched
> > > in and turn it back on when it is switched off.
> >
> > Also x86-64 has idle notifiers for this if you really wanted
> > to do it properly.
> >
> That looks like a useful feature I could leverage but why is it just
> on x86-64 at the moment?
s390 has it too (I stole it from there) Others could add it I guess
if there is a good case.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 14:33 ` Andi Kleen
@ 2006-06-30 16:02 ` Stephane Eranian
2006-06-30 17:08 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 16:02 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi,
As a first step, I am looking at implementing a TIF_DEBUG
for x86-64. AFAIK, debug registers must not be inherited on
fork().
In copy_thread(), I do not see the place where the child's
debug registers (or just debugreg7) are cleared to avoid
reloading the parents values in __switch_to() if debugreg7
is not 0. I believe the debug registers values are copied
in dup_task_struct().
Am I missing something?
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 16:02 ` Stephane Eranian
@ 2006-06-30 17:08 ` Andi Kleen
2006-06-30 20:47 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 17:08 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
On Friday 30 June 2006 18:02, Stephane Eranian wrote:
> Andi,
>
> As a first step, I am looking at implementing a TIF_DEBUG
> for x86-64. AFAIK, debug registers must not be inherited on
> fork().
Why not? Especially for threads you probably want them
in the new thread too.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-30 18:33 Chuck Ebbert
2006-06-30 18:42 ` Andi Kleen
` (2 more replies)
0 siblings, 3 replies; 25+ messages in thread
From: Chuck Ebbert @ 2006-06-30 18:33 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, Stephane Eranian
In-Reply-To: <200606301541.22928.ak@suse.de>
On Fri, 30 Jun 2006 15:41:22 +0200, Andi Kleen wrote:
> > So why do we need care about context switch in cpu-wide mode?
> > It is because we support a mode where the idle thread is excluded
> > from cpu-wide monitoring. This is very useful to distinguish
> > 'useful kernel work' from 'idle'.
>
> I don't quite see the point because on x86 the PMU doesn't run
> during C states anyways. So you get idle excluded automatically.
Looks like it does run:
$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
<session to end in 10 seconds>
CPU0 60351837 CPU_CLK_UNHALTED
CPU0 346548229 INTERRUPTS_MASKED_CYCLES
The CPU spent ~60 million clocks unhalted and ~350 million with interrupts
disabled. (This is an idle 1.6GHz Turion64 machine.)
Now let's see what happens when we exclude the idle thread:
$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10 --excl-idle
<session to end in 10 seconds>
CPU0 449250 CPU_CLK_UNHALTED
CPU0 161577 INTERRUPTS_MASKED_CYCLES
Looks like excluding the idle thread means interrupts that happen while idle
don't get counted either. We took 5000 clock interrupts and I know they take
longer than that to process.
--
Chuck
"You can't read a newspaper if you can't read." --George W. Bush
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 18:33 Chuck Ebbert
@ 2006-06-30 18:42 ` Andi Kleen
2006-06-30 18:43 ` Stephane Eranian
2006-06-30 20:40 ` Stephane Eranian
2 siblings, 0 replies; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 18:42 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: linux-kernel, Stephane Eranian
On Friday 30 June 2006 20:33, Chuck Ebbert wrote:
> In-Reply-To: <200606301541.22928.ak@suse.de>
>
> On Fri, 30 Jun 2006 15:41:22 +0200, Andi Kleen wrote:
>
> > > So why do we need care about context switch in cpu-wide mode?
> > > It is because we support a mode where the idle thread is excluded
> > > from cpu-wide monitoring. This is very useful to distinguish
> > > 'useful kernel work' from 'idle'.
> >
> > I don't quite see the point because on x86 the PMU doesn't run
> > during C states anyways. So you get idle excluded automatically.
>
> Looks like it does run:
I'm pretty sure it doesn't. You can see it by watching
the frequency of the perfctr mode NMI watchdog in /proc/interrupts
under different loads.
When the system is idle the frequency goes down and increases
when the system is busy. I also got confirmation of this behaviour
from both Intel and AMD. C states > 0 are not supposed to run
the performance counters.
Are you sure you didn't boot with idle=poll? Otherwise something must
be wrong with your measurements.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 18:33 Chuck Ebbert
2006-06-30 18:42 ` Andi Kleen
@ 2006-06-30 18:43 ` Stephane Eranian
2006-06-30 20:40 ` Stephane Eranian
2 siblings, 0 replies; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 18:43 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: Andi Kleen, linux-kernel
Chuck,
On Fri, Jun 30, 2006 at 02:33:49PM -0400, Chuck Ebbert wrote:
> In-Reply-To: <200606301541.22928.ak@suse.de>
>
> On Fri, 30 Jun 2006 15:41:22 +0200, Andi Kleen wrote:
>
> > > So why do we need care about context switch in cpu-wide mode?
> > > It is because we support a mode where the idle thread is excluded
> > > from cpu-wide monitoring. This is very useful to distinguish
> > > 'useful kernel work' from 'idle'.
> >
> > I don't quite see the point because on x86 the PMU doesn't run
> > during C states anyways. So you get idle excluded automatically.
>
> Looks like it does run:
>
> $ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0 60351837 CPU_CLK_UNHALTED
> CPU0 346548229 INTERRUPTS_MASKED_CYCLES
>
> The CPU spent ~60 million clocks unhalted and ~350 million with interrupts
> disabled. (This is an idle 1.6GHz Turion64 machine.)
>
I think it stops counting only CERTAIN events. That's where it becomes difficult
to interpret. There was a defect like this on Itanium 2.
> Now let's see what happens when we exclude the idle thread:
>
> $ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10 --excl-idle
> <session to end in 10 seconds>
> CPU0 449250 CPU_CLK_UNHALTED
> CPU0 161577 INTERRUPTS_MASKED_CYCLES
>
> Looks like excluding the idle thread means interrupts that happen while idle
> don't get counted either. We took 5000 clock interrupts and I know they take
> longer than that to process.
>
Yes, the way it is implemented today means that nothing happening while the idle
threads runs/sleeps is counted. Monitoring is disabled at context switch boundaries
for task->pid==0.
I would rather have something that would stop counting *EVERYTHING* ONLY when
going low-power OR when busy looping. That's why the idle notifiers that Andi mentioned
are interesting. But this would not cover interrupts received while in idle and yet we
want to measure those as they represent useful kernel work. This is unless interrupts while
idle qualifies for an exit_idle() notification.
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-06-30 19:17 Chuck Ebbert
2006-06-30 19:37 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-06-30 19:17 UTC (permalink / raw)
To: Andi Kleen; +Cc: Stephane Eranian, linux-kernel
In-Reply-To: <200606302042.23661.ak@suse.de>
On Fri, 30 Jun 2006 20:42:23 +0200. Andi Kleen wrote:
> > > I don't quite see the point because on x86 the PMU doesn't run
> > > during C states anyways. So you get idle excluded automatically.
> >
> > Looks like it does run:
>
> I'm pretty sure it doesn't. You can see it by watching
> the frequency of the perfctr mode NMI watchdog in /proc/interrupts
> under different loads.
>
> When the system is idle the frequency goes down and increases
> when the system is busy.
But that is using cpu_clk_unhalted (isn't it?) If so, it would slow down
when the system is idle.
The BIOS writer's guide, Ch. 10.2, says only events outside of the
processor, like northbridge DMA accesses, stop counting during halt.
(And by definition cpu_clk_unhalted.)
> Are you sure you didn't boot with idle=poll?
$ pfmon --smpl-module=inst-hist --smpl-show-function --smpl-show-top=40 \
-ecpu_clk_unhalted -k --long-smpl-period=10000 --resolve-addr --system-wide -t 10
only kernel symbols are resolved in system-wide mode
<session to end in 10 seconds>
# counts %self %cum code address
2501 85.42% 85.42% __do_softirq<kernel>
222 7.58% 93.00% acpi_processor_idle<kernel> <========
100 3.42% 96.41% ehci_watchdog<kernel>
39 1.33% 97.75% ehci_hub_status_data<kernel>
I'm pretty sure. :) Looking at that pile of code in acpi_processor_idle
and the way it disables interrupts I think I'll switch to idle=halt, though.
> Otherwise something must be wrong with your measurements.
In that case it's all Stephane's fault: he wrote the code!
--
Chuck
"You can't read a newspaper if you can't read." --George W. Bush
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 19:17 [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch Chuck Ebbert
@ 2006-06-30 19:37 ` Andi Kleen
0 siblings, 0 replies; 25+ messages in thread
From: Andi Kleen @ 2006-06-30 19:37 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: Stephane Eranian, linux-kernel
> But that is using cpu_clk_unhalted (isn't it?) If so, it would slow down
> when the system is idle.
> The BIOS writer's guide, Ch. 10.2, says only events outside of the
> processor,
The other events don't happen by definition.
If the system is C1 idle there are no cache misses, no pipe line events,
nothing - just cache snoops and waiting for interrupts and TSC
ticking.
> like northbridge DMA accesses, stop counting during halt.
> (And by definition cpu_clk_unhalted.)
It also depends on which C state and how the BIOS implements your C state.
e.g. there is C1/C2/C3 and then there are various modi of C1
(HLT aka C1 is actually some SMM code in the BIOS that does different
stuff).
I think there is at least one mode that ramps down large parts of the
CPU (it's called C1 clock ramping - that is what has caused the TSC
sync problems on some dual core systems).
I guess your BIOS is not very aggressive in its SMM code in
disabling the CPU.
C2/C3 also depend on SMM code, but when implemented should definitely
stop everything.
Intel also has different implementations of C1/C2/C3 depending
on CPU and BIOS. Especially lowend code
But I still maintain something must be wrong with your
measurements.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 18:33 Chuck Ebbert
2006-06-30 18:42 ` Andi Kleen
2006-06-30 18:43 ` Stephane Eranian
@ 2006-06-30 20:40 ` Stephane Eranian
2 siblings, 0 replies; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 20:40 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: Andi Kleen, linux-kernel
Chuck,
On Fri, Jun 30, 2006 at 02:33:49PM -0400, Chuck Ebbert wrote:
> In-Reply-To: <200606301541.22928.ak@suse.de>
>
> On Fri, 30 Jun 2006 15:41:22 +0200, Andi Kleen wrote:
>
> > > So why do we need care about context switch in cpu-wide mode?
> > > It is because we support a mode where the idle thread is excluded
> > > from cpu-wide monitoring. This is very useful to distinguish
> > > 'useful kernel work' from 'idle'.
> >
> > I don't quite see the point because on x86 the PMU doesn't run
> > during C states anyways. So you get idle excluded automatically.
>
> Looks like it does run:
>
> $ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0 60351837 CPU_CLK_UNHALTED
> CPU0 346548229 INTERRUPTS_MASKED_CYCLES
>
> The CPU spent ~60 million clocks unhalted and ~350 million with interrupts
> disabled. (This is an idle 1.6GHz Turion64 machine.)
>
As Andi is suggesting, I think this may depend on how the BIOS implements
the low-power state. I have tried the same command on my dual Opteron 250
2.4GHz and I get:
$ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
<session to end in 10 seconds>
CPU0 9,520,303 CPU_CLK_UNHALTED
CPU0 3,726,315 INTERRUPTS_MASKED_CYCLES
CPU1 21,268,151 CPU_CLK_UNHALTED
CPU1 14,515,389 INTERRUPTS_MASKED_CYCLES
That's without idle=poll.
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 17:08 ` Andi Kleen
@ 2006-06-30 20:47 ` Stephane Eranian
0 siblings, 0 replies; 25+ messages in thread
From: Stephane Eranian @ 2006-06-30 20:47 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
On Fri, Jun 30, 2006 at 07:08:03PM +0200, Andi Kleen wrote:
> On Friday 30 June 2006 18:02, Stephane Eranian wrote:
> > Andi,
> >
> > As a first step, I am looking at implementing a TIF_DEBUG
> > for x86-64. AFAIK, debug registers must not be inherited on
> > fork().
>
> Why not? Especially for threads you probably want them
> in the new thread too.
>
For this to work, the new thread must also inherit the ptrace
flag and ptrace re-parenting stuff. That would happen under
the cover as the ptracing application would not necessarily
know about this. It would also need to be able to name that new
thread (via gettid() and not getpid()) to be able to operate
on it.
In my mind, it has to work the other way around. The ptracing
process interested in ptracing/debugging new threads, sets the
right ptrace notification options for CLONE, when it gets the
notification it attaches to the newly created thread and reprograms
the breakpoints. This is the way I allow a tool such as pfmon,
to monitor across fork, pthread_create(), for instance.
Off the top of my head, I think for Itanium, we disable breakpoints
for the child in copy_threads(). I don't know for the other architectures.
Anybody can comment on this?
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-07-01 15:21 Chuck Ebbert
2006-07-04 15:28 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-07-01 15:21 UTC (permalink / raw)
To: Stephane Eranian; +Cc: linux-kernel, Andi Kleen
In-Reply-To: <20060630204032.GB22835@frankl.hpl.hp.com>
On Fri, 30 Jun 2006 13:40:32 -0700, Stephane Eranian wrote:
> As Andi is suggesting, I think this may depend on how the BIOS implements
> the low-power state. I have tried the same command on my dual Opteron 250
> 2.4GHz and I get:
> $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0 9,520,303 CPU_CLK_UNHALTED
> CPU0 3,726,315 INTERRUPTS_MASKED_CYCLES
> CPU1 21,268,151 CPU_CLK_UNHALTED
> CPU1 14,515,389 INTERRUPTS_MASKED_CYCLES
That is similar to what I get with idle=halt. Are you not using ACPI
for idle?
Try this:
$ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
<session to end in 10 seconds>
CPU0 95016828 CPU_CLK_UNHALTED
CPU0 36472783 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU0 67484408 INTERRUPTS_MASKED_CYCLES
CPU0 445326968 CYCLES_NO_FPU_OPS_RETIRED
That's what I get with idle=halt. Since the kernel doesn't do FP
the last line should equal clock cycles. If it were running at full
speed it would be 16 billion...
--
Chuck
"You can't read a newspaper if you can't read." --George W. Bush
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-06-30 12:59 ` Andi Kleen
2006-06-30 13:29 ` Stephane Eranian
@ 2006-07-03 9:49 ` Stephane Eranian
2006-07-03 19:25 ` Andi Kleen
1 sibling, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-07-03 9:49 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
[-- Attachment #1: Type: text/plain, Size: 2344 bytes --]
Andi,
Here is a first cut at the patch to simplify the context
switch for the common case and also touch 2 cachelines (instead of 3).
There are 2 new TIF flags. I just tried this on x86_64 but I believe
we could do the same on i386.
Is that what you were thinking about?
Thanks
On Fri, Jun 30, 2006 at 02:59:05PM +0200, Andi Kleen wrote:
> Stephane Eranian <eranian@hpl.hp.com> writes:
>
> > Andi,
> >
> > Thanks for your feedback. I will make the changes you
> > requested.
> >
> > About the context switch code, what about I do the following
> > in __switch_to():
> >
> > __kprobes struct task_struct *
> > __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> > {
> > struct thread_struct *prev = &prev_p->thread,
> > *next = &next_p->thread;
> > int cpu = smp_processor_id();
> > struct tss_struct *tss = &per_cpu(init_tss, cpu);
> >
> > if (unlikely(__get_cpu_var(pmu_ctx) || next_p->pfm_context))
> > __pfm_ctxswout(prev_p, next_p);
> >
> > /*
> > * Reload esp0, LDT and the page table pointer:
> > */
> > tss->rsp0 = next->rsp0;
> >
> > There is now a single hook and a conditional branch.
> > this is similar to what you have with the debug registers.
>
> It's still more than there was before. Also __get_cpu_var
> is quite a lot of instructions.
>
> I would suggest you borrow some bits in one of the process
> or thread info flags and then do a single test
>
> if (unlikely(thr->flags & (DEBUG|PERFMON)) != 0) {
> if (flags & DEBUG)
> ... do debug ...
> if (flags & PERFMON)
> ... do perfmon ...
> }
>
> [which you're at it you can probably add ioports in there too -
> improving existing code is always a good thing]
>
> Ideally flags is in some cache line that is already
> touched during context switch. If not you might need
> to change the layout.
>
> It's ok to put the do_perfmon stuff into a separate noinline
> function because that will disturb the register allocation
> in the caller less.
>
> I would suggest doing this in separate preparing patches that
> first just do it for existing facilities.
>
> -Andi
>
> P.S.: My comments probably apply to the i386 versions too
> although I haven't read them.
--
-Stephane
[-- Attachment #2: tif.diff --]
[-- Type: text/plain, Size: 6225 bytes --]
diff -urNp linux-2.6.17.2.orig/arch/x86_64/ia32/ptrace32.c linux-2.6.17.2-tif/arch/x86_64/ia32/ptrace32.c
--- linux-2.6.17.2.orig/arch/x86_64/ia32/ptrace32.c 2006-06-17 18:49:35.000000000 -0700
+++ linux-2.6.17.2-tif/arch/x86_64/ia32/ptrace32.c 2006-06-30 09:02:16.000000000 -0700
@@ -118,6 +118,10 @@ static int putreg32(struct task_struct *
if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
return -EIO;
child->thread.debugreg7 = val;
+ if (val)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
break;
default:
diff -urNp linux-2.6.17.2.orig/arch/x86_64/kernel/ioport.c linux-2.6.17.2-tif/arch/x86_64/kernel/ioport.c
--- linux-2.6.17.2.orig/arch/x86_64/kernel/ioport.c 2006-06-17 18:49:35.000000000 -0700
+++ linux-2.6.17.2-tif/arch/x86_64/kernel/ioport.c 2006-07-03 02:06:59.000000000 -0700
@@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
}
/*
diff -urNp linux-2.6.17.2.orig/arch/x86_64/kernel/process.c linux-2.6.17.2-tif/arch/x86_64/kernel/process.c
--- linux-2.6.17.2.orig/arch/x86_64/kernel/process.c 2006-06-17 18:49:35.000000000 -0700
+++ linux-2.6.17.2-tif/arch/x86_64/kernel/process.c 2006-07-03 02:35:25.000000000 -0700
@@ -356,6 +356,7 @@ void exit_thread(void)
*/
memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
t->io_bitmap_max = 0;
+ clear_thread_flag(TIF_IO_BITMAP);
put_cpu();
}
}
@@ -366,7 +367,7 @@ void flush_thread(void)
struct thread_info *t = current_thread_info();
if (t->flags & _TIF_ABI_PENDING)
- t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
+ t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32 | _TIF_DEBUG);
tsk->thread.debugreg0 = 0;
tsk->thread.debugreg1 = 0;
@@ -459,7 +460,7 @@ int copy_thread(int nr, unsigned long cl
asm("mov %%es,%0" : "=m" (p->thread.es));
asm("mov %%ds,%0" : "=m" (p->thread.ds));
- if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
@@ -467,6 +468,7 @@ int copy_thread(int nr, unsigned long cl
}
memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
@@ -484,7 +486,7 @@ int copy_thread(int nr, unsigned long cl
}
err = 0;
out:
- if (err && p->thread.io_bitmap_ptr) {
+ if (err && test_tsk_thread_flag(p, TIF_IO_BITMAP)) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
@@ -584,38 +586,34 @@ __switch_to(struct task_struct *prev_p,
task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
/*
- * Now maybe reload the debug registers
+ * Now maybe reload the debug registers and handle I/O bitmaps
*/
- if (unlikely(next->debugreg7)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
+ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+ || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
- /*
- * Handle the IO bitmap
- */
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
- if (next->io_bitmap_ptr)
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+ loaddebug(next, 2);
+ loaddebug(next, 3);
+ /* no 4 and 5 */
+ loaddebug(next, 6);
+ loaddebug(next, 7);
+ }
+ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Copy the relevant range of the IO bitmap.
* Normally this is 128 bytes or less:
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- else {
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
}
-
return prev_p;
}
diff -urNp linux-2.6.17.2.orig/arch/x86_64/kernel/ptrace.c linux-2.6.17.2-tif/arch/x86_64/kernel/ptrace.c
--- linux-2.6.17.2.orig/arch/x86_64/kernel/ptrace.c 2006-06-17 18:49:35.000000000 -0700
+++ linux-2.6.17.2-tif/arch/x86_64/kernel/ptrace.c 2006-06-30 09:30:57.000000000 -0700
@@ -420,9 +420,16 @@ long arch_ptrace(struct task_struct *chi
if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
break;
if (i == 4) {
- child->thread.debugreg7 = data;
+ child->thread.debugreg7 = data;
+ if (data) {
+ pr_info("set TIF_DEBUG for [%d]\n", child->pid);
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ } else {
+ pr_info("clear TIF_DEBUG for [%d]\n", child->pid);
+ clear_tsk_thread_flag(child, TIF_DEBUG);
+ }
ret = 0;
- }
+ }
break;
}
break;
diff -urNp linux-2.6.17.2.orig/include/asm-x86_64/thread_info.h linux-2.6.17.2-tif/include/asm-x86_64/thread_info.h
--- linux-2.6.17.2.orig/include/asm-x86_64/thread_info.h 2006-06-17 18:49:35.000000000 -0700
+++ linux-2.6.17.2-tif/include/asm-x86_64/thread_info.h 2006-07-03 02:32:25.000000000 -0700
@@ -106,6 +106,8 @@ static inline struct thread_info *stack_
#define TIF_FORK 18 /* ret_from_fork */
#define TIF_ABI_PENDING 19
#define TIF_MEMDIE 20
+#define TIF_DEBUG 21 /* uses debug registers */
+#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
@@ -119,6 +121,8 @@ static inline struct thread_info *stack_
#define _TIF_IA32 (1<<TIF_IA32)
#define _TIF_FORK (1<<TIF_FORK)
#define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING)
+#define _TIF_DEBUG (1<<TIF_DEBUG)
+#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
@@ -126,6 +130,9 @@ static inline struct thread_info *stack_
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+/* flags to check in __switch_to() */
+#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
+
#define PREEMPT_ACTIVE 0x10000000
/*
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-07-03 19:25 ` Andi Kleen
@ 2006-07-03 19:22 ` Stephane Eranian
2006-07-03 19:36 ` Andi Kleen
0 siblings, 1 reply; 25+ messages in thread
From: Stephane Eranian @ 2006-07-03 19:22 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel
Andi,
On Mon, Jul 03, 2006 at 09:25:48PM +0200, Andi Kleen wrote:
> On Monday 03 July 2006 11:49, Stephane Eranian wrote:
> > Andi,
> >
> > Here is a first cut at the patch to simplify the context
> > switch for the common case and also touch 2 cachelines (instead of 3).
> > There are 2 new TIF flags. I just tried this on x86_64 but I believe
> > we could do the same on i386.
> >
> > Is that what you were thinking about?
>
> Yes, looks good.
>
I have worked on this some more and I also looked at i386.
They seem to handle the io bitmap differently there. At least
they seem to have some lazy scheme whereby you do not reinstall
if new task is the last owner of tss. Is there a reason for not
having this in place for x86-64?
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-07-03 9:49 ` Stephane Eranian
@ 2006-07-03 19:25 ` Andi Kleen
2006-07-03 19:22 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2006-07-03 19:25 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
On Monday 03 July 2006 11:49, Stephane Eranian wrote:
> Andi,
>
> Here is a first cut at the patch to simplify the context
> switch for the common case and also touch 2 cachelines (instead of 3).
> There are 2 new TIF flags. I just tried this on x86_64 but I believe
> we could do the same on i386.
>
> Is that what you were thinking about?
Yes, looks good.
Thanks,
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-07-03 19:22 ` Stephane Eranian
@ 2006-07-03 19:36 ` Andi Kleen
0 siblings, 0 replies; 25+ messages in thread
From: Andi Kleen @ 2006-07-03 19:36 UTC (permalink / raw)
To: eranian; +Cc: linux-kernel
On Monday 03 July 2006 21:22, Stephane Eranian wrote:
> Andi,
>
> On Mon, Jul 03, 2006 at 09:25:48PM +0200, Andi Kleen wrote:
> > On Monday 03 July 2006 11:49, Stephane Eranian wrote:
> > > Andi,
> > >
> > > Here is a first cut at the patch to simplify the context
> > > switch for the common case and also touch 2 cachelines (instead of 3).
> > > There are 2 new TIF flags. I just tried this on x86_64 but I believe
> > > we could do the same on i386.
> > >
> > > Is that what you were thinking about?
> >
> > Yes, looks good.
> >
> I have worked on this some more and I also looked at i386.
> They seem to handle the io bitmap differently there. At least
> they seem to have some lazy scheme whereby you do not reinstall
> if new task is the last owner of tss. Is there a reason for not
> having this in place for x86-64?
There was a reason but I can't remember it right now :/ Something
in the new scheme didn't fit well or it was just too complicated.
-Andi
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-07-01 15:21 Chuck Ebbert
@ 2006-07-04 15:28 ` Stephane Eranian
0 siblings, 0 replies; 25+ messages in thread
From: Stephane Eranian @ 2006-07-04 15:28 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: linux-kernel, Andi Kleen
Chuck,
On Sat, Jul 01, 2006 at 11:21:22AM -0400, Chuck Ebbert wrote:
> In-Reply-To: <20060630204032.GB22835@frankl.hpl.hp.com>
>
> On Fri, 30 Jun 2006 13:40:32 -0700, Stephane Eranian wrote:
>
> > As Andi is suggesting, I think this may depend on how the BIOS implements
> > the low-power state. I have tried the same command on my dual Opteron 250
> > 2.4GHz and I get:
> > $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles -k --system-wide -t 10
> > <session to end in 10 seconds>
> > CPU0 9,520,303 CPU_CLK_UNHALTED
> > CPU0 3,726,315 INTERRUPTS_MASKED_CYCLES
> > CPU1 21,268,151 CPU_CLK_UNHALTED
> > CPU1 14,515,389 INTERRUPTS_MASKED_CYCLES
>
> That is similar to what I get with idle=halt. Are you not using ACPI
> for idle?
>
> Try this:
>
> $ pfmon -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0 95016828 CPU_CLK_UNHALTED
> CPU0 36472783 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> CPU0 67484408 INTERRUPTS_MASKED_CYCLES
> CPU0 445326968 CYCLES_NO_FPU_OPS_RETIRED
>
> That's what I get with idle=halt. Since the kernel doesn't do FP
> the last line should equal clock cycles. If it were running at full
> speed it would be 16 billion...
Here is what I get on my dual 2.4GHz Opteron 250:
booted with idle=halt
$ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
<session to end in 10 seconds>
CPU0 11,356,210 CPU_CLK_UNHALTED
CPU0 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU0 3,836,107 INTERRUPTS_MASKED_CYCLES
CPU0 23,910,784,532 CYCLES_NO_FPU_OPS_RETIRED
CPU1 19,303,632 CPU_CLK_UNHALTED
CPU1 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU1 13,942,265 INTERRUPTS_MASKED_CYCLES
CPU1 23,911,872,654 CYCLES_NO_FPU_OPS_RETIRED
As you can see, CYCLES_NO_FPU_OPS_RETIRED on each CPU is as expected for 10s.
booted with idle=poll
$ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
<session to end in 10 seconds>
CPU0 23,906,091,982 CPU_CLK_UNHALTED
CPU0 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU0 3,771,569 INTERRUPTS_MASKED_CYCLES
CPU0 23,906,090,750 CYCLES_NO_FPU_OPS_RETIRED
CPU1 23,906,629,241 CPU_CLK_UNHALTED
CPU1 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
CPU1 14,805,078 INTERRUPTS_MASKED_CYCLES
CPU1 23,906,194,343 CYCLES_NO_FPU_OPS_RETIRED
CYCLES_NO_FPU_OPS_RETIRED is as expected and, in this case, it is equal to CPU_CLK_UNHALTED
because we are busy looping.
If I don't specify anything, I get like idle=halt which is expected.
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
@ 2006-07-06 17:30 Chuck Ebbert
2006-07-06 20:16 ` Stephane Eranian
0 siblings, 1 reply; 25+ messages in thread
From: Chuck Ebbert @ 2006-07-06 17:30 UTC (permalink / raw)
To: eranian@hpl.hp.com; +Cc: Andi Kleen, linux-kernel
In-Reply-To: <20060704152857.GA6999@frankl.hpl.hp.com>
On Tue, 4 Jul 2006 08:28:57 -0700, Stephane Eranian wrote:
> Here is what I get on my dual 2.4GHz Opteron 250:
>
> booted with idle=halt
> $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
> <session to end in 10 seconds>
> CPU0 11,356,210 CPU_CLK_UNHALTED
> CPU0 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> CPU0 3,836,107 INTERRUPTS_MASKED_CYCLES
> CPU0 23,910,784,532 CYCLES_NO_FPU_OPS_RETIRED
> CPU1 19,303,632 CPU_CLK_UNHALTED
> CPU1 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> CPU1 13,942,265 INTERRUPTS_MASKED_CYCLES
> CPU1 23,911,872,654 CYCLES_NO_FPU_OPS_RETIRED
So it looks like your Opteron continues to count CYCLES_NO_FPU_OPS_RETIRED
at full clock speed even when halted.
My Turion appears to slow down to ~40 MHz when halted and counts those
events at that speed. Using idle=poll shows no slowdown, as expected,
and unhalted clocks equals cycles_no_fpu_ops_retired.
--
Chuck
"You can't read a newspaper if you can't read." --George W. Bush
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch
2006-07-06 17:30 Chuck Ebbert
@ 2006-07-06 20:16 ` Stephane Eranian
0 siblings, 0 replies; 25+ messages in thread
From: Stephane Eranian @ 2006-07-06 20:16 UTC (permalink / raw)
To: Chuck Ebbert; +Cc: Andi Kleen, linux-kernel
Chuck,
On Thu, Jul 06, 2006 at 01:30:14PM -0400, Chuck Ebbert wrote:
> In-Reply-To: <20060704152857.GA6999@frankl.hpl.hp.com>
>
> On Tue, 4 Jul 2006 08:28:57 -0700, Stephane Eranian wrote:
>
> > Here is what I get on my dual 2.4GHz Opteron 250:
> >
> > booted with idle=halt
> > $ pfmon --us-c -ecpu_clk_unhalted,interrupts_masked_cycles_with_interrupt_pending,interrupts_masked_cycles,cycles_no_fpu_ops_retired -k --system-wide -t 10
> > <session to end in 10 seconds>
> > CPU0 11,356,210 CPU_CLK_UNHALTED
> > CPU0 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> > CPU0 3,836,107 INTERRUPTS_MASKED_CYCLES
> > CPU0 23,910,784,532 CYCLES_NO_FPU_OPS_RETIRED
> > CPU1 19,303,632 CPU_CLK_UNHALTED
> > CPU1 0 INTERRUPTS_MASKED_CYCLES_WITH_INTERRUPT_PENDING
> > CPU1 13,942,265 INTERRUPTS_MASKED_CYCLES
> > CPU1 23,911,872,654 CYCLES_NO_FPU_OPS_RETIRED
>
> So it looks like your Opteron continues to count CYCLES_NO_FPU_OPS_RETIRED
> at full clock speed even when halted.
>
Yes, it certainly looks like it, idling at full clock speed. But that's a server CPU!
I think this may have to do with latency to get back to useful work.
> My Turion appears to slow down to ~40 MHz when halted and counts those
> events at that speed. Using idle=poll shows no slowdown, as expected,
> and unhalted clocks equals cycles_no_fpu_ops_retired.
>
That's a mobile processor, so I would expect it to use more aggressive
power-saving scheme while idle, at the cost of higher latency to get
back to full speed.
The good thing about this is that it was not too hard to figure out what is
going on using pfmon. Were you specifically looking for this? Is this a documented
behavior of the Turion64?
--
-Stephane
^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2006-07-06 20:24 UTC | newest]
Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-30 19:17 [PATCH 10/17] 2.6.17.1 perfmon2 patch for review: PMU context switch Chuck Ebbert
2006-06-30 19:37 ` Andi Kleen
-- strict thread matches above, loose matches on Subject: below --
2006-07-06 17:30 Chuck Ebbert
2006-07-06 20:16 ` Stephane Eranian
2006-07-01 15:21 Chuck Ebbert
2006-07-04 15:28 ` Stephane Eranian
2006-06-30 18:33 Chuck Ebbert
2006-06-30 18:42 ` Andi Kleen
2006-06-30 18:43 ` Stephane Eranian
2006-06-30 20:40 ` Stephane Eranian
2006-06-23 9:13 Stephane Eranian
2006-06-30 12:27 ` Andi Kleen
2006-06-30 12:36 ` Stephane Eranian
2006-06-30 12:59 ` Andi Kleen
2006-06-30 13:29 ` Stephane Eranian
2006-06-30 13:41 ` Andi Kleen
2006-06-30 14:12 ` Stephane Eranian
2006-06-30 14:33 ` Andi Kleen
2006-06-30 16:02 ` Stephane Eranian
2006-06-30 17:08 ` Andi Kleen
2006-06-30 20:47 ` Stephane Eranian
2006-07-03 9:49 ` Stephane Eranian
2006-07-03 19:25 ` Andi Kleen
2006-07-03 19:22 ` Stephane Eranian
2006-07-03 19:36 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox