* [PATCH 0/4] generic software counters
@ 2009-03-10 14:19 Peter Zijlstra
2009-03-10 14:19 ` [PATCH 1/4] perf_counter: software counter event infrastructure Peter Zijlstra
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:19 UTC (permalink / raw)
To: mingo, paulus; +Cc: linux-kernel, Peter Zijlstra
--
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/4] perf_counter: software counter event infrastructure
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
@ 2009-03-10 14:19 ` Peter Zijlstra
2009-03-10 14:19 ` [PATCH 2/4] perf_counter: provide pagefault software events Peter Zijlstra
` (3 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:19 UTC (permalink / raw)
To: mingo, paulus; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_swcounter.patch --]
[-- Type: text/plain, Size: 8070 bytes --]
Provide generic software counter infrastructure that supports software events.
This will be used to allow sample based profiling based on software events
such as pagefaults. The current infrastructure can only provide a count
of such events, with no information about where they occurred.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 8 +
kernel/perf_counter.c | 206 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 214 insertions(+)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -126,6 +126,7 @@ struct hw_perf_counter {
unsigned long counter_base;
int nmi;
unsigned int idx;
+ atomic64_t count; /* software */
atomic64_t prev_count;
u64 irq_period;
atomic64_t period_left;
@@ -283,6 +284,8 @@ static inline int is_software_counter(st
return !counter->hw_event.raw && counter->hw_event.type < 0;
}
+extern void perf_swcounter_event(enum hw_event_types, u64, struct pt_regs *);
+
#else
static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
@@ -299,6 +302,11 @@ static inline void hw_perf_restore(u64 c
static inline u64 hw_perf_save_disable(void) { return 0; }
static inline int perf_counter_task_disable(void) { return -EINVAL; }
static inline int perf_counter_task_enable(void) { return -EINVAL; }
+
+static inline void
+perf_swcounter_event(enum hw_event_types event, u64 nr, struct pt_regs *regs)
+{
+}
#endif
#endif /* __KERNEL__ */
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1330,6 +1330,187 @@ static const struct file_operations perf
.compat_ioctl = perf_ioctl,
};
+/*
+ * Generic software counter infrastructure
+ */
+
+static void perf_swcounter_update(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 prev, now;
+ s64 delta;
+
+again:
+ prev = atomic64_read(&hwc->prev_count);
+ now = atomic64_read(&hwc->count);
+ if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+
+ atomic64_add(delta, &counter->count);
+ atomic64_add(delta, &hwc->period_left);
+}
+
+static void perf_swcounter_set_period(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ s64 left = atomic64_read(&hwc->period_left);
+ s64 period = hwc->irq_period;
+
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_add(period, &hwc->period_left);
+ }
+
+ atomic64_set(&hwc->prev_count, -left);
+ atomic64_set(&hwc->count, -left);
+}
+
+static void perf_swcounter_save_and_restart(struct perf_counter *counter)
+{
+ perf_swcounter_update(counter);
+ perf_swcounter_set_period(counter);
+}
+
+static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
+{
+ struct perf_data *irqdata = counter->irqdata;
+
+ if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+ irqdata->overrun++;
+ } else {
+ u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+ *p = data;
+ irqdata->len += sizeof(u64);
+ }
+}
+
+static void perf_swcounter_handle_group(struct perf_counter *sibling)
+{
+ struct perf_counter *counter, *group_leader = sibling->group_leader;
+
+ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+ perf_swcounter_update(counter);
+ perf_swcounter_store_irq(sibling, counter->hw_event.type);
+ perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
+ }
+}
+
+static void
+perf_swcounter_interrupt(struct perf_counter *counter, struct pt_regs *regs)
+{
+ perf_swcounter_save_and_restart(counter);
+
+ switch (counter->hw_event.record_type) {
+ case PERF_RECORD_SIMPLE:
+ break;
+
+ case PERF_RECORD_IRQ:
+ perf_swcounter_store_irq(counter, instruction_pointer(regs));
+ break;
+
+ case PERF_RECORD_GROUP:
+ perf_swcounter_handle_group(counter);
+ break;
+ }
+
+ wake_up(&counter->waitq);
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+ enum hw_event_types event,
+ struct pt_regs *regs)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return 0;
+
+ if (counter->hw_event.raw)
+ return 0;
+
+ if (counter->hw_event.type != event)
+ return 0;
+
+ if (counter->hw_event.exclude_user && user_mode(regs))
+ return 0;
+
+ if (counter->hw_event.exclude_kernel && !user_mode(regs))
+ return 0;
+
+ return 1;
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+ enum hw_event_types event, u64 nr,
+ struct pt_regs *regs)
+{
+ struct perf_counter *counter;
+ unsigned long flags;
+
+ if (list_empty(&ctx->counter_list))
+ return;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ /*
+ * XXX: make counter_list RCU safe
+ */
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (perf_swcounter_match(counter, event, regs) &&
+ !atomic64_add_negative(nr, &counter->hw.count))
+ perf_swcounter_interrupt(counter, regs);
+ }
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+void
+perf_swcounter_event(enum hw_event_types event, u64 nr, struct pt_regs *regs)
+{
+ struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+
+ perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, regs);
+ if (cpuctx->task_ctx)
+ perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, regs);
+
+ put_cpu_var(perf_cpu_context);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+ perf_swcounter_update(counter);
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (counter->prev_state <= PERF_COUNTER_STATE_OFF) {
+ atomic64_set(&counter->count, 0);
+ atomic64_set(&hwc->count, 0);
+ atomic64_set(&hwc->prev_count, 0);
+ }
+
+ perf_swcounter_set_period(counter);
+
+ return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+ perf_swcounter_update(counter);
+}
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
int cpu = raw_smp_processor_id();
@@ -1367,6 +1548,10 @@ static const struct hw_perf_counter_ops
};
/*
+ * Software counter: task time clock
+ */
+
+/*
* Called from within the scheduler:
*/
static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
@@ -1422,6 +1607,10 @@ static const struct hw_perf_counter_ops
.read = task_clock_perf_counter_read,
};
+/*
+ * Software counter: page faults
+ */
+
#ifdef CONFIG_VM_EVENT_COUNTERS
#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
#else
@@ -1475,6 +1664,10 @@ static const struct hw_perf_counter_ops
.read = page_faults_perf_counter_read,
};
+/*
+ * Software counter: context switches
+ */
+
static u64 get_context_switches(struct perf_counter *counter)
{
struct task_struct *curr = counter->ctx->task;
@@ -1523,6 +1716,10 @@ static const struct hw_perf_counter_ops
.read = context_switches_perf_counter_read,
};
+/*
+ * Software counter: cpu migrations
+ */
+
static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
struct task_struct *curr = counter->ctx->task;
@@ -1574,7 +1771,9 @@ static const struct hw_perf_counter_ops
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
+ struct perf_counter_hw_event *hw_event = &counter->hw_event;
const struct hw_perf_counter_ops *hw_ops = NULL;
+ struct hw_perf_counter *hwc = &counter->hw;
/*
* Software counters (currently) can't in general distinguish
@@ -1620,6 +1819,13 @@ sw_perf_counter_init(struct perf_counter
default:
break;
}
+
+ if (hw_ops) {
+ hwc->irq_period = hw_event->irq_period;
+ atomic64_set(&hwc->period_left, hwc->irq_period);
+ counter->wakeup_pending = 0;
+ }
+
return hw_ops;
}
--
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 2/4] perf_counter: provide pagefault software events
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
2009-03-10 14:19 ` [PATCH 1/4] perf_counter: software counter event infrastructure Peter Zijlstra
@ 2009-03-10 14:19 ` Peter Zijlstra
2009-03-10 14:59 ` Ingo Molnar
2009-03-10 14:19 ` [PATCH 3/4] perf_counter: x86: fix 32bit irq_period assumption Peter Zijlstra
` (2 subsequent siblings)
4 siblings, 1 reply; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:19 UTC (permalink / raw)
To: mingo, paulus; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_swcounter_pgfault.patch --]
[-- Type: text/plain, Size: 3515 bytes --]
We use the generic software counter infrastructure to provide page fault
events.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/mm/fault.c | 3 ++
arch/x86/mm/fault.c | 3 ++
kernel/perf_counter.c | 53 ++----------------------------------------------
3 files changed, 9 insertions(+), 50 deletions(-)
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1601,57 +1601,10 @@ static const struct hw_perf_counter_ops
* Software counter: page faults
*/
-#ifdef CONFIG_VM_EVENT_COUNTERS
-#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
-#else
-#define cpu_page_faults() 0
-#endif
-
-static u64 get_page_faults(struct perf_counter *counter)
-{
- struct task_struct *curr = counter->ctx->task;
-
- if (curr)
- return curr->maj_flt + curr->min_flt;
- return cpu_page_faults();
-}
-
-static void page_faults_perf_counter_update(struct perf_counter *counter)
-{
- u64 prev, now;
- s64 delta;
-
- prev = atomic64_read(&counter->hw.prev_count);
- now = get_page_faults(counter);
-
- atomic64_set(&counter->hw.prev_count, now);
-
- delta = now - prev;
-
- atomic64_add(delta, &counter->count);
-}
-
-static void page_faults_perf_counter_read(struct perf_counter *counter)
-{
- page_faults_perf_counter_update(counter);
-}
-
-static int page_faults_perf_counter_enable(struct perf_counter *counter)
-{
- if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
- atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
- return 0;
-}
-
-static void page_faults_perf_counter_disable(struct perf_counter *counter)
-{
- page_faults_perf_counter_update(counter);
-}
-
static const struct hw_perf_counter_ops perf_ops_page_faults = {
- .enable = page_faults_perf_counter_enable,
- .disable = page_faults_perf_counter_disable,
- .read = page_faults_perf_counter_read,
+ .enable = perf_swcounter_enable,
+ .disable = perf_swcounter_disable,
+ .read = perf_swcounter_read,
};
/*
Index: linux-2.6/arch/powerpc/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/powerpc/mm/fault.c
+++ linux-2.6/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/perf_counter.h>
#include <asm/firmware.h>
#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_re
die("Weird page fault", regs, SIGSEGV);
}
+ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, regs);
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
Index: linux-2.6/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/fault.c
+++ linux-2.6/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
#include <linux/tty.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/perf_counter.h>
#include <asm-generic/sections.h>
@@ -1056,6 +1057,8 @@ do_page_fault(struct pt_regs *regs, unsi
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
+ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, regs);
+
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
--
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 3/4] perf_counter: x86: fix 32bit irq_period assumption.
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
2009-03-10 14:19 ` [PATCH 1/4] perf_counter: software counter event infrastructure Peter Zijlstra
2009-03-10 14:19 ` [PATCH 2/4] perf_counter: provide pagefault software events Peter Zijlstra
@ 2009-03-10 14:19 ` Peter Zijlstra
2009-03-10 14:19 ` [PATCH 4/4] perf_counter: use list_move_tail Peter Zijlstra
2009-03-10 14:35 ` [PATCH 0/4] generic software counters Peter Zijlstra
4 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:19 UTC (permalink / raw)
To: mingo, paulus; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-x86.patch --]
[-- Type: text/plain, Size: 666 bytes --]
No need to assume the irq_period is 32bit.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/x86/kernel/cpu/perf_counter.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf
struct hw_perf_counter *hwc, int idx)
{
s64 left = atomic64_read(&hwc->period_left);
- s32 period = hwc->irq_period;
+ s64 period = hwc->irq_period;
int err;
/*
--
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 4/4] perf_counter: use list_move_tail
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
` (2 preceding siblings ...)
2009-03-10 14:19 ` [PATCH 3/4] perf_counter: x86: fix 32bit irq_period assumption Peter Zijlstra
@ 2009-03-10 14:19 ` Peter Zijlstra
2009-03-10 14:35 ` [PATCH 0/4] generic software counters Peter Zijlstra
4 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:19 UTC (permalink / raw)
To: mingo, paulus; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-list_op.patch --]
[-- Type: text/plain, Size: 722 bytes --]
Instead of del/add use a move list-op.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/perf_counter.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -89,8 +89,7 @@ list_del_counter(struct perf_counter *co
list_for_each_entry_safe(sibling, tmp,
&counter->sibling_list, list_entry) {
- list_del_init(&sibling->list_entry);
- list_add_tail(&sibling->list_entry, &ctx->counter_list);
+ list_move_tail(&sibling->list_entry, &ctx->counter_list);
sibling->group_leader = sibling;
}
}
--
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 0/4] generic software counters
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
` (3 preceding siblings ...)
2009-03-10 14:19 ` [PATCH 4/4] perf_counter: use list_move_tail Peter Zijlstra
@ 2009-03-10 14:35 ` Peter Zijlstra
4 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 14:35 UTC (permalink / raw)
To: mingo; +Cc: paulus, linux-kernel
On Tue, 2009-03-10 at 15:19 +0100, Peter Zijlstra wrote:
Right, this was supposed to say RFC; it's been compiled, but not much
more.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/4] perf_counter: provide pagefault software events
2009-03-10 14:19 ` [PATCH 2/4] perf_counter: provide pagefault software events Peter Zijlstra
@ 2009-03-10 14:59 ` Ingo Molnar
2009-03-10 15:06 ` Peter Zijlstra
0 siblings, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:59 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: paulus, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Index: linux-2.6/arch/powerpc/mm/fault.c
> ===================================================================
> --- linux-2.6.orig/arch/powerpc/mm/fault.c
> +++ linux-2.6/arch/powerpc/mm/fault.c
> @@ -29,6 +29,7 @@
> #include <linux/module.h>
> #include <linux/kprobes.h>
> #include <linux/kdebug.h>
> +#include <linux/perf_counter.h>
>
> #include <asm/firmware.h>
> #include <asm/page.h>
> @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_re
> die("Weird page fault", regs, SIGSEGV);
> }
>
> + perf_swcount_event(PERF_COUNT_PAGE_FAULTS, 1, regs);
Wow, that's really nice - this way we can display a profile of
pagefault events in KernelTop - as if it was a real cycles or
other hw event based profile. Have you tried it out?
Ingo
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/4] perf_counter: provide pagefault software events
2009-03-10 14:59 ` Ingo Molnar
@ 2009-03-10 15:06 ` Peter Zijlstra
0 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-03-10 15:06 UTC (permalink / raw)
To: Ingo Molnar; +Cc: paulus, linux-kernel
On Tue, 2009-03-10 at 15:59 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > Index: linux-2.6/arch/powerpc/mm/fault.c
> > ===================================================================
> > --- linux-2.6.orig/arch/powerpc/mm/fault.c
> > +++ linux-2.6/arch/powerpc/mm/fault.c
> > @@ -29,6 +29,7 @@
> > #include <linux/module.h>
> > #include <linux/kprobes.h>
> > #include <linux/kdebug.h>
> > +#include <linux/perf_counter.h>
> >
> > #include <asm/firmware.h>
> > #include <asm/page.h>
> > @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_re
> > die("Weird page fault", regs, SIGSEGV);
> > }
> >
> > + perf_swcount_event(PERF_COUNT_PAGE_FAULTS, 1, regs);
>
> Wow, that's really nice - this way we can display a profile of
> pagefault events in KernelTop - as if it was a real cycles or
> other hw event based profile. Have you tried it out?
Yeah, but unsurprisingly all the pagefaults I generated were in userspace, so
kerneltop stayed empty.
Found that perfstat went bonkers though; I forgot to handle the case where
there's no overflow interrupt.
/me goes poke at the code a bit more.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2009-03-10 15:06 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-03-10 14:19 [PATCH 0/4] generic software counters Peter Zijlstra
2009-03-10 14:19 ` [PATCH 1/4] perf_counter: software counter event infrastructure Peter Zijlstra
2009-03-10 14:19 ` [PATCH 2/4] perf_counter: provide pagefault software events Peter Zijlstra
2009-03-10 14:59 ` Ingo Molnar
2009-03-10 15:06 ` Peter Zijlstra
2009-03-10 14:19 ` [PATCH 3/4] perf_counter: x86: fix 32bit irq_period assumption Peter Zijlstra
2009-03-10 14:19 ` [PATCH 4/4] perf_counter: use list_move_tail Peter Zijlstra
2009-03-10 14:35 ` [PATCH 0/4] generic software counters Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox