* [PATCH 0/4] perf_counter bits
@ 2009-05-01 10:23 Peter Zijlstra
2009-05-01 10:23 ` [PATCH 1/4] perf_counter: fix race in perf_output_* Peter Zijlstra
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-01 10:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
- fixes a race in the output code
- x86: fixes a hang in nmi_watchdog=2 vs perf_counters
- teaches perf-report to handle 0-length files
- updates the documentation
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/4] perf_counter: fix race in perf_output_*
2009-05-01 10:23 [PATCH 0/4] perf_counter bits Peter Zijlstra
@ 2009-05-01 10:23 ` Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 2/4] perf_counter: fix nmi-watchdog interaction Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-01 10:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-output-race.patch --]
[-- Type: text/plain, Size: 6696 bytes --]
When two (or more) contexts output to the same buffer, it is possible
to observe half written output.
Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then CPU2 can observe the data CPU0 is still writing.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 5 +
kernel/perf_counter.c | 130 +++++++++++++++++++++++++++++++++----------
2 files changed, 105 insertions(+), 30 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -358,10 +358,13 @@ struct perf_mmap_data {
struct rcu_head rcu_head;
int nr_pages; /* nr of data pages */
- atomic_t wakeup; /* POLL_ for wakeups */
+ atomic_t poll; /* POLL_ for wakeups */
atomic_t head; /* write position */
atomic_t events; /* event limit */
+ atomic_t wakeup_head; /* completed head */
+ atomic_t lock; /* concurrent writes */
+
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct fil
{
struct perf_counter *counter = file->private_data;
struct perf_mmap_data *data;
- unsigned int events;
+ unsigned int events = POLL_HUP;
rcu_read_lock();
data = rcu_dereference(counter->data);
if (data)
- events = atomic_xchg(&data->wakeup, 0);
- else
- events = POLL_HUP;
+ events = atomic_xchg(&data->poll, 0);
rcu_read_unlock();
poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf
void perf_counter_wakeup(struct perf_counter *counter)
{
- struct perf_mmap_data *data;
-
- rcu_read_lock();
- data = rcu_dereference(counter->data);
- if (data) {
- atomic_set(&data->wakeup, POLL_IN);
- /*
- * Ensure all data writes are issued before updating the
- * user-space data head information. The matching rmb()
- * will be in userspace after reading this value.
- */
- smp_wmb();
- data->user_page->data_head = atomic_read(&data->head);
- }
- rcu_read_unlock();
-
wake_up_all(&counter->waitq);
if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
int wakeup;
int nmi;
int overflow;
+ int locked;
+ unsigned long flags;
};
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
{
+ atomic_set(&handle->data->poll, POLL_IN);
+
if (handle->nmi) {
handle->counter->pending_wakeup = 1;
perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(
perf_counter_wakeup(handle->counter);
}
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int cpu;
+
+ handle->locked = 0;
+
+ local_irq_save(handle->flags);
+ cpu = smp_processor_id();
+
+ if (in_nmi() && atomic_read(&data->lock) == cpu)
+ return;
+
+ while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+ cpu_relax();
+
+ handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int head, cpu;
+
+ if (handle->wakeup)
+ data->wakeup_head = data->head;
+
+ if (!handle->locked)
+ goto out;
+
+again:
+ /*
+ * The xchg implies a full barrier that ensures all writes are done
+ * before we publish the new head, matched by a rmb() in userspace when
+ * reading this position.
+ */
+ while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+ data->user_page->data_head = head;
+ handle->wakeup = 1;
+ }
+
+ /*
+ * NMI can happen here, which means we can miss a wakeup_head update.
+ */
+
+ cpu = atomic_xchg(&data->lock, 0);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ /*
+ * Therefore we have to validate we did not indeed do so.
+ */
+ if (unlikely(atomic_read(&data->wakeup_head))) {
+ /*
+ * Since we had it locked, we can lock it again.
+ */
+ while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+ cpu_relax();
+
+ goto again;
+ }
+
+ if (handle->wakeup)
+ perf_output_wakeup(handle);
+out:
+ local_irq_restore(handle->flags);
+}
+
static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size,
int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf
if (!data)
goto out;
+ handle->data = data;
handle->counter = counter;
handle->nmi = nmi;
handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf
if (!data->nr_pages)
goto fail;
+ perf_output_lock(handle);
+
do {
offset = head = atomic_read(&data->head);
head += size;
} while (atomic_cmpxchg(&data->head, offset, head) != offset);
- handle->data = data;
handle->offset = offset;
handle->head = head;
handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf
return 0;
fail:
- __perf_output_wakeup(handle);
+ perf_output_wakeup(handle);
out:
rcu_read_unlock();
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf
static void perf_output_end(struct perf_output_handle *handle)
{
- int wakeup_events = handle->counter->hw_event.wakeup_events;
+ struct perf_counter *counter = handle->counter;
+ struct perf_mmap_data *data = handle->data;
+
+ int wakeup_events = counter->hw_event.wakeup_events;
if (handle->overflow && wakeup_events) {
- int events = atomic_inc_return(&handle->data->events);
+ int events = atomic_inc_return(&data->events);
if (events >= wakeup_events) {
- atomic_sub(wakeup_events, &handle->data->events);
- __perf_output_wakeup(handle);
+ atomic_sub(wakeup_events, &data->events);
+ handle->wakeup = 1;
}
- } else if (handle->wakeup)
- __perf_output_wakeup(handle);
+ }
+
+ perf_output_unlock(handle);
rcu_read_unlock();
}
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/4] perf_counter: fix nmi-watchdog interaction
2009-05-01 10:23 [PATCH 0/4] perf_counter bits Peter Zijlstra
2009-05-01 10:23 ` [PATCH 1/4] perf_counter: fix race in perf_output_* Peter Zijlstra
@ 2009-05-01 10:23 ` Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 3/4] perf_counter: tool: handle 0-length data files Peter Zijlstra
2009-05-01 10:23 ` [PATCH 4/4] perf_counter: documentation update Peter Zijlstra
3 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-01 10:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-fix-nmi.patch --]
[-- Type: text/plain, Size: 670 bytes --]
When we don't have any perf-counters active, don't act like we know
what the NMI is for.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/x86/kernel/cpu/perf_counter.c | 3 +++
1 file changed, 3 insertions(+)
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -891,6 +891,9 @@ perf_counter_nmi_handler(struct notifier
struct pt_regs *regs;
int ret;
+ if (!atomic_read(&num_counters))
+ return NOTIFY_DONE;
+
switch (cmd) {
case DIE_NMI:
case DIE_NMI_IPI:
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 3/4] perf_counter: tool: handle 0-length data files
2009-05-01 10:23 [PATCH 0/4] perf_counter bits Peter Zijlstra
2009-05-01 10:23 ` [PATCH 1/4] perf_counter: fix race in perf_output_* Peter Zijlstra
2009-05-01 10:23 ` [PATCH 2/4] perf_counter: fix nmi-watchdog interaction Peter Zijlstra
@ 2009-05-01 10:23 ` Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 4/4] perf_counter: documentation update Peter Zijlstra
3 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-01 10:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-report-zero-file.patch --]
[-- Type: text/plain, Size: 650 bytes --]
Avoid perf-report barfing on 0-length data files.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Documentation/perf_counter/perf-report.cc | 5 +++++
1 file changed, 5 insertions(+)
Index: linux-2.6/Documentation/perf_counter/perf-report.cc
===================================================================
--- linux-2.6.orig/Documentation/perf_counter/perf-report.cc
+++ linux-2.6/Documentation/perf_counter/perf-report.cc
@@ -402,6 +402,11 @@ int main(int argc, char *argv[])
exit(-1);
}
+ if (!stat.st_size) {
+ fprintf(stderr, "zero-sized file, nothing to do!\n");
+ exit(0);
+ }
+
load_kallsyms();
remap:
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 4/4] perf_counter: documentation update
2009-05-01 10:23 [PATCH 0/4] perf_counter bits Peter Zijlstra
` (2 preceding siblings ...)
2009-05-01 10:23 ` [PATCH 3/4] perf_counter: tool: handle 0-length data files Peter Zijlstra
@ 2009-05-01 10:23 ` Peter Zijlstra
2009-05-01 13:28 ` [tip:perfcounters/core] perf_counter: documentation update tip-bot for Peter Zijlstra
3 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-01 10:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-doc.patch --]
[-- Type: text/plain, Size: 13254 bytes --]
Update the documentation to reflect the current state of affairs
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Documentation/perf_counter/design.txt | 272 +++++++++++++++++++++++++++-------
1 file changed, 219 insertions(+), 53 deletions(-)
Index: linux-2.6/Documentation/perf_counter/design.txt
===================================================================
--- linux-2.6.orig/Documentation/perf_counter/design.txt
+++ linux-2.6/Documentation/perf_counter/design.txt
@@ -34,41 +34,47 @@ can be poll()ed.
When creating a new counter fd, 'perf_counter_hw_event' is:
-/*
- * Event to monitor via a performance monitoring counter:
- */
struct perf_counter_hw_event {
- __u64 event_config;
-
- __u64 irq_period;
- __u64 record_type;
- __u64 read_format;
+ /*
+ * The MSB of the config word signifies if the rest contains cpu
+ * specific (raw) counter configuration data, if unset, the next
+ * 7 bits are an event type and the rest of the bits are the event
+ * identifier.
+ */
+ __u64 config;
+
+ __u64 irq_period;
+ __u32 record_type;
+ __u32 read_format;
+
+ __u64 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ inherit : 1, /* children inherit it */
+ pinned : 1, /* must always be on PMU */
+ exclusive : 1, /* only group on PMU */
+ exclude_user : 1, /* don't count user */
+ exclude_kernel : 1, /* ditto kernel */
+ exclude_hv : 1, /* ditto hypervisor */
+ exclude_idle : 1, /* don't count when idle */
+ mmap : 1, /* include mmap data */
+ munmap : 1, /* include munmap data */
+ comm : 1, /* include comm data */
- __u64 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- inherit : 1, /* children inherit it */
- pinned : 1, /* must always be on PMU */
- exclusive : 1, /* only group on PMU */
- exclude_user : 1, /* don't count user */
- exclude_kernel : 1, /* ditto kernel */
- exclude_hv : 1, /* ditto hypervisor */
- exclude_idle : 1, /* don't count when idle */
+ __reserved_1 : 52;
- __reserved_1 : 55;
+ __u32 extra_config_len;
+ __u32 wakeup_events; /* wakeup every n events */
- __u32 extra_config_len;
-
- __u32 __reserved_4;
- __u64 __reserved_2;
- __u64 __reserved_3;
+ __u64 __reserved_2;
+ __u64 __reserved_3;
};
-The 'event_config' field specifies what the counter should count. It
+The 'config' field specifies what the counter should count. It
is divided into 3 bit-fields:
-raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
-type: 7 bits (next most significant) 0x7f00_0000_0000_0000
-event_id: 56 bits (least significant) 0x00ff_0000_0000_0000
+raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
+type: 7 bits (next most significant) 0x7f00_0000_0000_0000
+event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
If 'raw_type' is 1, then the counter will count a hardware event
specified by the remaining 63 bits of event_config. The encoding is
@@ -134,41 +140,56 @@ enum sw_event_ids {
PERF_COUNT_PAGE_FAULTS_MAJ = 6,
};
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
Counters come in two flavours: counting counters and sampling
counters. A "counting" counter is one that is used for counting the
number of events that occur, and is characterised by having
-irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
-counting counter simply returns the current value of the counter as
-an 8-byte number.
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possible
+additional values as specified by 'read_format', each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+ PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
+ PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
A "sampling" counter is one that is set up to generate an interrupt
every N events, where N is given by 'irq_period'. A sampling counter
-has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
-record_type controls what data is recorded on each interrupt, and the
-available values are currently:
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
/*
- * IRQ-notification data record type:
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
*/
-enum perf_counter_record_type {
- PERF_RECORD_SIMPLE = 0,
- PERF_RECORD_IRQ = 1,
- PERF_RECORD_GROUP = 2,
-};
-
-A record_type value of PERF_RECORD_IRQ will record the instruction
-pointer (IP) at which the interrupt occurred. A record_type value of
-PERF_RECORD_GROUP will record the event_config and counter value of
-all of the other counters in the group, and should only be used on a
-group leader (see below). Currently these two values are mutually
-exclusive, but record_type will become a bit-mask in future and
-support other values.
-
-A sampling counter has an event queue, into which an event is placed
-on each interrupt. A read() on a sampling counter will read the next
-event from the event queue. If the queue is empty, the read() will
-either block or return an EAGAIN error, depending on whether the fd
-has been set to non-blocking mode or not.
+enum perf_counter_record_format {
+ PERF_RECORD_IP = 1U << 0,
+ PERF_RECORD_TID = 1U << 1,
+ PERF_RECORD_TIME = 1U << 2,
+ PERF_RECORD_ADDR = 1U << 3,
+ PERF_RECORD_GROUP = 1U << 4,
+ PERF_RECORD_CALLCHAIN = 1U << 5,
+};
+
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
The 'disabled' bit specifies whether the counter starts out disabled
or enabled. If it is initially disabled, it can be enabled by ioctl
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and
way to request that counting of events be restricted to times when the
CPU is in user, kernel and/or hypervisor mode.
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations, these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone,
+these events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
The 'pid' parameter to the perf_counter_open() system call allows the
counter to be specific to a task:
@@ -250,6 +278,138 @@ can be meaningfully compared, added, div
with each other, since they have counted events for the same set of
executed instructions.
+
+As mentioned above, asynchronous events, such as counter overflow or PROT_EXEC mmap
+tracking are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+
+ /*
+ * Bits needed to read the hw counters in user-space.
+ *
+ * u32 seq;
+ * s64 count;
+ *
+ * do {
+ * seq = pc->lock;
+ *
+ * barrier();
+ * if (pc->index) {
+ * count = pmc_read(pc->index - 1);
+ * count += pc->offset;
+ * } else
+ * goto regular_read;
+ *
+ * barrier();
+ * } while (pc->lock != seq);
+ *
+ * NOTE: for obvious reason this only works on self-monitoring
+ * processes.
+ */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+
+ /*
+ * Control data for the mmap() data buffer.
+ *
+ * User-space reading this value should issue an rmb(), on SMP capable
+ * platforms, after reading this value -- see perf_counter_wakeup().
+ */
+ __u32 data_head; /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+ implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL (1 << 0)
+#define PERF_EVENT_MISC_USER (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
+
+struct perf_event_header {
+ __u32 type;
+ __u16 misc;
+ __u16 size;
+};
+
+enum perf_event_type {
+
+ /*
+ * The MMAP events record the PROT_EXEC mappings so that we can
+ * correlate userspace IPs to code. They have the following structure:
+ *
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * u64 addr;
+ * u64 len;
+ * u64 pgoff;
+ * char filename[];
+ * };
+ */
+ PERF_EVENT_MMAP = 1,
+ PERF_EVENT_MUNMAP = 2,
+
+ /*
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * char comm[];
+ * };
+ */
+ PERF_EVENT_COMM = 3,
+
+ /*
+ * When header.misc & PERF_EVENT_MISC_OVERFLOW the header.type field
+ * will be PERF_RECORD_*
+ *
+ * struct {
+ * struct perf_event_header header;
+ *
+ * { u64 ip; } && PERF_RECORD_IP
+ * { u32 pid, tid; } && PERF_RECORD_TID
+ * { u64 time; } && PERF_RECORD_TIME
+ * { u64 addr; } && PERF_RECORD_ADDR
+ *
+ * { u64 nr;
+ * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
+ *
+ * { u16 nr,
+ * hv,
+ * kernel,
+ * user;
+ * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
+ * };
+ */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+ on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
Counters can be enabled and disabled in two ways: via ioctl and via
prctl. When a counter is disabled, it doesn't count or generate
events but does continue to exist and maintain its count value.
@@ -269,6 +429,12 @@ group other than the leader only affects
non-leader stops that counter from counting but doesn't affect any
other counter.
+Additionally, non-inherited overflow counters can use
+
+ ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
A process can enable or disable all the counter groups that are
attached to it, using prctl:
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [tip:perfcounters/core] perf_counter: fix race in perf_output_*
2009-05-01 10:23 ` [PATCH 1/4] perf_counter: fix race in perf_output_* Peter Zijlstra
@ 2009-05-01 13:27 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-01 13:27 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: c33a0bc4e41ef169d6e807d8abb9502544b518e5
Gitweb: http://git.kernel.org/tip/c33a0bc4e41ef169d6e807d8abb9502544b518e5
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 1 May 2009 12:23:16 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 1 May 2009 13:23:43 +0200
perf_counter: fix race in perf_output_*
When two (or more) contexts output to the same buffer, it is possible
to observe half written output.
Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then CPU2 can observe the data CPU0 is still writing.
[ Impact: fix occasionally corrupted profiling records ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.007821627@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 5 +-
kernel/perf_counter.c | 130 ++++++++++++++++++++++++++++++++---------
2 files changed, 105 insertions(+), 30 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 41aed42..f776851 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -358,10 +358,13 @@ struct perf_mmap_data {
struct rcu_head rcu_head;
int nr_pages; /* nr of data pages */
- atomic_t wakeup; /* POLL_ for wakeups */
+ atomic_t poll; /* POLL_ for wakeups */
atomic_t head; /* write position */
atomic_t events; /* event limit */
+ atomic_t wakeup_head; /* completed head */
+ atomic_t lock; /* concurrent writes */
+
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75f2b6c..8660ae5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_counter *counter = file->private_data;
struct perf_mmap_data *data;
- unsigned int events;
+ unsigned int events = POLL_HUP;
rcu_read_lock();
data = rcu_dereference(counter->data);
if (data)
- events = atomic_xchg(&data->wakeup, 0);
- else
- events = POLL_HUP;
+ events = atomic_xchg(&data->poll, 0);
rcu_read_unlock();
poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = {
void perf_counter_wakeup(struct perf_counter *counter)
{
- struct perf_mmap_data *data;
-
- rcu_read_lock();
- data = rcu_dereference(counter->data);
- if (data) {
- atomic_set(&data->wakeup, POLL_IN);
- /*
- * Ensure all data writes are issued before updating the
- * user-space data head information. The matching rmb()
- * will be in userspace after reading this value.
- */
- smp_wmb();
- data->user_page->data_head = atomic_read(&data->head);
- }
- rcu_read_unlock();
-
wake_up_all(&counter->waitq);
if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
int wakeup;
int nmi;
int overflow;
+ int locked;
+ unsigned long flags;
};
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
{
+ atomic_set(&handle->data->poll, POLL_IN);
+
if (handle->nmi) {
handle->counter->pending_wakeup = 1;
perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
perf_counter_wakeup(handle->counter);
}
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int cpu;
+
+ handle->locked = 0;
+
+ local_irq_save(handle->flags);
+ cpu = smp_processor_id();
+
+ if (in_nmi() && atomic_read(&data->lock) == cpu)
+ return;
+
+ while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+ cpu_relax();
+
+ handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int head, cpu;
+
+ if (handle->wakeup)
+ data->wakeup_head = data->head;
+
+ if (!handle->locked)
+ goto out;
+
+again:
+ /*
+ * The xchg implies a full barrier that ensures all writes are done
+ * before we publish the new head, matched by a rmb() in userspace when
+ * reading this position.
+ */
+ while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+ data->user_page->data_head = head;
+ handle->wakeup = 1;
+ }
+
+ /*
+ * NMI can happen here, which means we can miss a wakeup_head update.
+ */
+
+ cpu = atomic_xchg(&data->lock, 0);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ /*
+ * Therefore we have to validate we did not indeed do so.
+ */
+ if (unlikely(atomic_read(&data->wakeup_head))) {
+ /*
+ * Since we had it locked, we can lock it again.
+ */
+ while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+ cpu_relax();
+
+ goto again;
+ }
+
+ if (handle->wakeup)
+ perf_output_wakeup(handle);
+out:
+ local_irq_restore(handle->flags);
+}
+
static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size,
int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
if (!data)
goto out;
+ handle->data = data;
handle->counter = counter;
handle->nmi = nmi;
handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
if (!data->nr_pages)
goto fail;
+ perf_output_lock(handle);
+
do {
offset = head = atomic_read(&data->head);
head += size;
} while (atomic_cmpxchg(&data->head, offset, head) != offset);
- handle->data = data;
handle->offset = offset;
handle->head = head;
handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
return 0;
fail:
- __perf_output_wakeup(handle);
+ perf_output_wakeup(handle);
out:
rcu_read_unlock();
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle,
static void perf_output_end(struct perf_output_handle *handle)
{
- int wakeup_events = handle->counter->hw_event.wakeup_events;
+ struct perf_counter *counter = handle->counter;
+ struct perf_mmap_data *data = handle->data;
+
+ int wakeup_events = counter->hw_event.wakeup_events;
if (handle->overflow && wakeup_events) {
- int events = atomic_inc_return(&handle->data->events);
+ int events = atomic_inc_return(&data->events);
if (events >= wakeup_events) {
- atomic_sub(wakeup_events, &handle->data->events);
- __perf_output_wakeup(handle);
+ atomic_sub(wakeup_events, &data->events);
+ handle->wakeup = 1;
}
- } else if (handle->wakeup)
- __perf_output_wakeup(handle);
+ }
+
+ perf_output_unlock(handle);
rcu_read_unlock();
}
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [tip:perfcounters/core] perf_counter: fix nmi-watchdog interaction
2009-05-01 10:23 ` [PATCH 2/4] perf_counter: fix nmi-watchdog interaction Peter Zijlstra
@ 2009-05-01 13:27 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-01 13:27 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 63a809a2dc53b91268dd915bbcbd425063893676
Gitweb: http://git.kernel.org/tip/63a809a2dc53b91268dd915bbcbd425063893676
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 1 May 2009 12:23:17 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 1 May 2009 13:23:44 +0200
perf_counter: fix nmi-watchdog interaction
When we don't have any perf-counters active, don't act like we know
what the NMI is for.
[ Impact: fix hard hang with nmi_watchdog=2 ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.109867793@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/x86/kernel/cpu/perf_counter.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc06f4d..d4c0cc9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
struct pt_regs *regs;
int ret;
+ if (!atomic_read(&num_counters))
+ return NOTIFY_DONE;
+
switch (cmd) {
case DIE_NMI:
case DIE_NMI_IPI:
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [tip:perfcounters/core] perf_counter: tool: handle 0-length data files
2009-05-01 10:23 ` [PATCH 3/4] perf_counter: tool: handle 0-length data files Peter Zijlstra
@ 2009-05-01 13:27 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-01 13:27 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 585e3374d9d29376c2c37d821c8b7637dd48ca95
Gitweb: http://git.kernel.org/tip/585e3374d9d29376c2c37d821c8b7637dd48ca95
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 1 May 2009 12:23:18 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 1 May 2009 13:23:44 +0200
perf_counter: tool: handle 0-length data files
Avoid perf-report barfing on 0-length data files.
[ Impact: fix perf-report SIGBUS ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.196245693@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/perf-report.cc | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 933a075..911d7f3 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -402,6 +402,11 @@ int main(int argc, char *argv[])
exit(-1);
}
+ if (!stat.st_size) {
+ fprintf(stderr, "zero-sized file, nothing to do!\n");
+ exit(0);
+ }
+
load_kallsyms();
remap:
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [tip:perfcounters/core] perf_counter: documentation update
2009-05-01 10:23 ` [PATCH 4/4] perf_counter: documetation update Peter Zijlstra
@ 2009-05-01 13:28 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-05-01 13:28 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: e5791a808ae91a9e7e1b65ea9b8de0f96a043d88
Gitweb: http://git.kernel.org/tip/e5791a808ae91a9e7e1b65ea9b8de0f96a043d88
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 1 May 2009 12:23:19 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 1 May 2009 13:23:45 +0200
perf_counter: documentation update
Update the documentation to reflect the current state of affairs
[ Impact: documentation update ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.296727903@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/design.txt | 274 ++++++++++++++++++++++++++-------
1 files changed, 220 insertions(+), 54 deletions(-)
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index aaf105c..9930c4b 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -34,41 +34,47 @@ can be poll()ed.
When creating a new counter fd, 'perf_counter_hw_event' is:
-/*
- * Event to monitor via a performance monitoring counter:
- */
struct perf_counter_hw_event {
- __u64 event_config;
-
- __u64 irq_period;
- __u64 record_type;
- __u64 read_format;
-
- __u64 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- inherit : 1, /* children inherit it */
- pinned : 1, /* must always be on PMU */
- exclusive : 1, /* only group on PMU */
- exclude_user : 1, /* don't count user */
- exclude_kernel : 1, /* ditto kernel */
- exclude_hv : 1, /* ditto hypervisor */
- exclude_idle : 1, /* don't count when idle */
-
- __reserved_1 : 55;
-
- __u32 extra_config_len;
-
- __u32 __reserved_4;
- __u64 __reserved_2;
- __u64 __reserved_3;
+ /*
+ * The MSB of the config word signifies if the rest contains cpu
+ * specific (raw) counter configuration data, if unset, the next
+ * 7 bits are an event type and the rest of the bits are the event
+ * identifier.
+ */
+ __u64 config;
+
+ __u64 irq_period;
+ __u32 record_type;
+ __u32 read_format;
+
+ __u64 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ inherit : 1, /* children inherit it */
+ pinned : 1, /* must always be on PMU */
+ exclusive : 1, /* only group on PMU */
+ exclude_user : 1, /* don't count user */
+ exclude_kernel : 1, /* ditto kernel */
+ exclude_hv : 1, /* ditto hypervisor */
+ exclude_idle : 1, /* don't count when idle */
+ mmap : 1, /* include mmap data */
+ munmap : 1, /* include munmap data */
+ comm : 1, /* include comm data */
+
+ __reserved_1 : 52;
+
+ __u32 extra_config_len;
+ __u32 wakeup_events; /* wakeup every n events */
+
+ __u64 __reserved_2;
+ __u64 __reserved_3;
};
-The 'event_config' field specifies what the counter should count. It
+The 'config' field specifies what the counter should count. It
is divided into 3 bit-fields:
-raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
-type: 7 bits (next most significant) 0x7f00_0000_0000_0000
-event_id: 56 bits (least significant) 0x00ff_0000_0000_0000
+raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
+type: 7 bits (next most significant) 0x7f00_0000_0000_0000
+event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
If 'raw_type' is 1, then the counter will count a hardware event
specified by the remaining 63 bits of event_config. The encoding is
@@ -134,41 +140,56 @@ enum sw_event_ids {
PERF_COUNT_PAGE_FAULTS_MAJ = 6,
};
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
Counters come in two flavours: counting counters and sampling
counters. A "counting" counter is one that is used for counting the
number of events that occur, and is characterised by having
-irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
-counting counter simply returns the current value of the counter as
-an 8-byte number.
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possibly
+additional values as specified by 'read_format'; each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+ PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
+ PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
A "sampling" counter is one that is set up to generate an interrupt
every N events, where N is given by 'irq_period'. A sampling counter
-has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
-record_type controls what data is recorded on each interrupt, and the
-available values are currently:
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
/*
- * IRQ-notification data record type:
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
*/
-enum perf_counter_record_type {
- PERF_RECORD_SIMPLE = 0,
- PERF_RECORD_IRQ = 1,
- PERF_RECORD_GROUP = 2,
+enum perf_counter_record_format {
+ PERF_RECORD_IP = 1U << 0,
+ PERF_RECORD_TID = 1U << 1,
+ PERF_RECORD_TIME = 1U << 2,
+ PERF_RECORD_ADDR = 1U << 3,
+ PERF_RECORD_GROUP = 1U << 4,
+ PERF_RECORD_CALLCHAIN = 1U << 5,
};
-A record_type value of PERF_RECORD_IRQ will record the instruction
-pointer (IP) at which the interrupt occurred. A record_type value of
-PERF_RECORD_GROUP will record the event_config and counter value of
-all of the other counters in the group, and should only be used on a
-group leader (see below). Currently these two values are mutually
-exclusive, but record_type will become a bit-mask in future and
-support other values.
-
-A sampling counter has an event queue, into which an event is placed
-on each interrupt. A read() on a sampling counter will read the next
-event from the event queue. If the queue is empty, the read() will
-either block or return an EAGAIN error, depending on whether the fd
-has been set to non-blocking mode or not.
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
The 'disabled' bit specifies whether the counter starts out disabled
or enabled. If it is initially disabled, it can be enabled by ioctl
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
way to request that counting of events be restricted to times when the
CPU is in user, kernel and/or hypervisor mode.
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations. These can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone.
+These events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
The 'pid' parameter to the perf_counter_open() system call allows the
counter to be specific to a task:
@@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc.,
with each other, since they have counted events for the same set of
executed instructions.
+
+As stated, asynchronous events, such as counter overflow or PROT_EXEC mmap
+tracking, are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+
+ /*
+ * Bits needed to read the hw counters in user-space.
+ *
+ * u32 seq;
+ * s64 count;
+ *
+ * do {
+ * seq = pc->lock;
+ *
+ * barrier();
+ * if (pc->index) {
+ * count = pmc_read(pc->index - 1);
+ * count += pc->offset;
+ * } else
+ * goto regular_read;
+ *
+ * barrier();
+ * } while (pc->lock != seq);
+ *
+ * NOTE: for obvious reasons this only works on self-monitoring
+ * processes.
+ */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+
+ /*
+ * Control data for the mmap() data buffer.
+ *
+ * User-space reading this value should issue an rmb(), on SMP capable
+ * platforms, after reading this value -- see perf_counter_wakeup().
+ */
+ __u32 data_head; /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+ implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL (1 << 0)
+#define PERF_EVENT_MISC_USER (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
+
+struct perf_event_header {
+ __u32 type;
+ __u16 misc;
+ __u16 size;
+};
+
+enum perf_event_type {
+
+ /*
+ * The MMAP events record the PROT_EXEC mappings so that we can
+ * correlate userspace IPs to code. They have the following structure:
+ *
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * u64 addr;
+ * u64 len;
+ * u64 pgoff;
+ * char filename[];
+ * };
+ */
+ PERF_EVENT_MMAP = 1,
+ PERF_EVENT_MUNMAP = 2,
+
+ /*
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * char comm[];
+ * };
+ */
+ PERF_EVENT_COMM = 3,
+
+ /*
+ * When header.misc & PERF_EVENT_MISC_OVERFLOW is set, the header.type field
+ * will be PERF_RECORD_*
+ *
+ * struct {
+ * struct perf_event_header header;
+ *
+ * { u64 ip; } && PERF_RECORD_IP
+ * { u32 pid, tid; } && PERF_RECORD_TID
+ * { u64 time; } && PERF_RECORD_TIME
+ * { u64 addr; } && PERF_RECORD_ADDR
+ *
+ * { u64 nr;
+ * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
+ *
+ * { u16 nr,
+ * hv,
+ * kernel,
+ * user;
+ * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
+ * };
+ */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+ on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
Counters can be enabled and disabled in two ways: via ioctl and via
prctl. When a counter is disabled, it doesn't count or generate
events but does continue to exist and maintain its count value.
@@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an
non-leader stops that counter from counting but doesn't affect any
other counter.
+Additionally, non-inherited overflow counters can use
+
+ ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
A process can enable or disable all the counter groups that are
attached to it, using prctl:
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2009-05-01 13:29 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2009-05-01 10:23 [PATCH 0/4] perf_counter bits Peter Zijlstra
2009-05-01 10:23 ` [PATCH 1/4] perf_counter: fix race in perf_output_* Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 2/4] perf_counter: fix nmi-watchdog interaction Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 3/4] perf_counter: tool: handle 0-length data files Peter Zijlstra
2009-05-01 13:27 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-05-01 10:23 ` [PATCH 4/4] perf_counter: documetation update Peter Zijlstra
2009-05-01 13:28 ` [tip:perfcounters/core] perf_counter: documentation update tip-bot for Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox