* [PATCH 0/6] more perf_counter stuff
@ 2009-04-02 9:11 Peter Zijlstra
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
` (5 more replies)
0 siblings, 6 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:11 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
This patch set implements some features requested by Paul/Corey.
The main one being the singleshot thing, which is still RFC due to
being incomplete.
It really needs a mechanism to run bits in a regular context ASAP
after the event context. My favourite is a self-IPI, because one can
issue it from NMI context, or IRQ-disabled rq->lock context and have
it work as expected.
I'll look at doing an x86 self-IPI.
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
@ 2009-04-02 9:11 ` Peter Zijlstra
2009-04-02 11:28 ` Ingo Molnar
` (3 more replies)
2009-04-02 9:12 ` [PATCH 2/6] RFC perf_counter: singleshot support Peter Zijlstra
` (4 subsequent siblings)
5 siblings, 4 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:11 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-record_type.patch --]
[-- Type: text/plain, Size: 7098 bytes --]
Per suggestion from Paul, move the event overflow bits to record_type
and sanitize the enums a bit.
Breaks the ABI -- again ;-)
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 50 ++++++++++++---------
kernel/perf_counter.c | 99 ++++++++++++++++---------------------------
2 files changed, 67 insertions(+), 82 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -73,15 +73,6 @@ enum sw_event_ids {
PERF_SW_EVENTS_MAX = 7,
};
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
- PERF_RECORD_SIMPLE = 0,
- PERF_RECORD_IRQ = 1,
- PERF_RECORD_GROUP = 2,
-};
-
#define __PERF_COUNTER_MASK(name) \
(((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
PERF_COUNTER_##name##_SHIFT)
@@ -103,6 +94,17 @@ enum perf_counter_record_type {
#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+ PERF_RECORD_IP = 1U << 0,
+ PERF_RECORD_TID = 1U << 1,
+ PERF_RECORD_GROUP = 1U << 2,
+ PERF_RECORD_CALLCHAIN = 1U << 3,
+};
+
+/*
* Bits that can be set in hw_event.read_format to request that
* reads on the counter should return the indicated quantities,
* in increasing order of bit value, after the counter value.
@@ -125,8 +127,8 @@ struct perf_counter_hw_event {
__u64 config;
__u64 irq_period;
- __u64 record_type;
- __u64 read_format;
+ __u32 record_type;
+ __u32 read_format;
__u64 disabled : 1, /* off by default */
nmi : 1, /* NMI sampling */
@@ -137,12 +139,10 @@ struct perf_counter_hw_event {
exclude_kernel : 1, /* ditto kernel */
exclude_hv : 1, /* ditto hypervisor */
exclude_idle : 1, /* don't count when idle */
- include_tid : 1, /* include the tid */
mmap : 1, /* include mmap data */
munmap : 1, /* include munmap data */
- callchain : 1, /* add callchain data */
- __reserved_1 : 51;
+ __reserved_1 : 53;
__u32 extra_config_len;
__u32 __reserved_4;
@@ -212,15 +212,21 @@ struct perf_event_header {
enum perf_event_type {
- PERF_EVENT_GROUP = 1,
+ PERF_EVENT_MMAP = 1,
+ PERF_EVENT_MUNMAP = 2,
- PERF_EVENT_MMAP = 2,
- PERF_EVENT_MUNMAP = 3,
-
- PERF_EVENT_OVERFLOW = 1UL << 31,
- __PERF_EVENT_IP = 1UL << 30,
- __PERF_EVENT_TID = 1UL << 29,
- __PERF_EVENT_CALLCHAIN = 1UL << 28,
+ /*
+ * Half the event type space is reserved for the counter overflow
+ * bitfields, as found in hw_event.record_type.
+ *
+ * These events will have types of the form:
+ * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+ */
+ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
+ __PERF_EVENT_IP = PERF_RECORD_IP,
+ __PERF_EVENT_TID = PERF_RECORD_TID,
+ __PERF_EVENT_GROUP = PERF_RECORD_GROUP,
+ __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN,
};
#ifdef __KERNEL__
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_
rcu_read_unlock();
}
-static void perf_output_simple(struct perf_counter *counter,
- int nmi, struct pt_regs *regs)
+void perf_counter_output(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
{
int ret;
+ u64 record_type = counter->hw_event.record_type;
struct perf_output_handle handle;
struct perf_event_header header;
u64 ip;
struct {
u32 pid, tid;
} tid_entry;
+ struct {
+ u64 event;
+ u64 counter;
+ } group_entry;
struct perf_callchain_entry *callchain = NULL;
int callchain_size = 0;
header.type = PERF_EVENT_OVERFLOW;
header.size = sizeof(header);
- ip = instruction_pointer(regs);
- header.type |= __PERF_EVENT_IP;
- header.size += sizeof(ip);
+ if (record_type & PERF_RECORD_IP) {
+ ip = instruction_pointer(regs);
+ header.type |= __PERF_EVENT_IP;
+ header.size += sizeof(ip);
+ }
- if (counter->hw_event.include_tid) {
+ if (record_type & PERF_RECORD_TID) {
/* namespace issues */
tid_entry.pid = current->group_leader->pid;
tid_entry.tid = current->pid;
@@ -1794,7 +1801,13 @@ static void perf_output_simple(struct pe
header.size += sizeof(tid_entry);
}
- if (counter->hw_event.callchain) {
+ if (record_type & PERF_RECORD_GROUP) {
+ header.type |= __PERF_EVENT_GROUP;
+ header.size += sizeof(u64) +
+ counter->nr_siblings * sizeof(group_entry);
+ }
+
+ if (record_type & PERF_RECORD_CALLCHAIN) {
callchain = perf_callchain(regs);
if (callchain) {
@@ -1810,69 +1823,35 @@ static void perf_output_simple(struct pe
return;
perf_output_put(&handle, header);
- perf_output_put(&handle, ip);
- if (counter->hw_event.include_tid)
- perf_output_put(&handle, tid_entry);
+ if (record_type & PERF_RECORD_IP)
+ perf_output_put(&handle, ip);
- if (callchain)
- perf_output_copy(&handle, callchain, callchain_size);
-
- perf_output_end(&handle);
-}
-
-static void perf_output_group(struct perf_counter *counter, int nmi)
-{
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_counter *leader, *sub;
- unsigned int size;
- struct {
- u64 event;
- u64 counter;
- } entry;
- int ret;
-
- size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+ if (record_type & PERF_RECORD_TID)
+ perf_output_put(&handle, tid_entry);
- ret = perf_output_begin(&handle, counter, size, nmi);
- if (ret)
- return;
+ if (record_type & PERF_RECORD_GROUP) {
+ struct perf_counter *leader, *sub;
+ u64 nr = counter->nr_siblings;
- header.type = PERF_EVENT_GROUP;
- header.size = size;
+ perf_output_put(&handle, nr);
- perf_output_put(&handle, header);
+ leader = counter->group_leader;
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ if (sub != counter)
+ sub->hw_ops->read(sub);
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->hw_ops->read(sub);
+ group_entry.event = sub->hw_event.config;
+ group_entry.counter = atomic64_read(&sub->count);
- entry.event = sub->hw_event.config;
- entry.counter = atomic64_read(&sub->count);
-
- perf_output_put(&handle, entry);
+ perf_output_put(&handle, group_entry);
+ }
}
- perf_output_end(&handle);
-}
-
-void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs)
-{
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- return;
-
- case PERF_RECORD_IRQ:
- perf_output_simple(counter, nmi, regs);
- break;
+ if (callchain)
+ perf_output_copy(&handle, callchain, callchain_size);
- case PERF_RECORD_GROUP:
- perf_output_group(counter, nmi);
- break;
- }
+ perf_output_end(&handle);
}
/*
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
@ 2009-04-02 9:12 ` Peter Zijlstra
2009-04-02 10:51 ` Ingo Molnar
2009-04-02 12:18 ` Peter Zijlstra
2009-04-02 9:12 ` [PATCH 3/6] perf_counter: per event wakeups Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:12 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-singleshot.patch --]
[-- Type: text/plain, Size: 7514 bytes --]
By request, provide a way for counters to disable themselves and signal
at the first counter overflow.
This isn't complete, we really want pending work to be done ASAP after
queueing it. My preferred method would be a self-IPI, that would ensure
we run the code in a usable context right after the current (IRQ-off,
NMI) context is done.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/kernel/perf_counter.c | 2
arch/x86/kernel/cpu/perf_counter.c | 2
include/linux/perf_counter.h | 21 +++++---
kernel/perf_counter.c | 94 ++++++++++++++++++++++++++++---------
4 files changed, 89 insertions(+), 30 deletions(-)
Index: linux-2.6/arch/powerpc/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_counter.c
+++ linux-2.6/arch/powerpc/kernel/perf_counter.c
@@ -732,7 +732,7 @@ static void record_and_restart(struct pe
* Finally record data if requested.
*/
if (record)
- perf_counter_output(counter, 1, regs);
+ perf_counter_overflow(counter, 1, regs);
}
/*
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,7 @@ again:
continue;
perf_save_and_restart(counter);
- perf_counter_output(counter, nmi, regs);
+ perf_counter_overflow(counter, nmi, regs);
}
hw_perf_ack_status(ack);
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -141,13 +141,19 @@ struct perf_counter_hw_event {
exclude_idle : 1, /* don't count when idle */
mmap : 1, /* include mmap data */
munmap : 1, /* include munmap data */
+ singleshot : 1, /* singleshot overflow */
- __reserved_1 : 53;
+ __reserved_1 : 52;
__u32 extra_config_len;
__u32 __reserved_4;
- __u64 __reserved_2;
+ /*
+ * Singleshot signal information.
+ */
+ __u32 signal_nr;
+ __u32 signal_tid;
+
__u64 __reserved_3;
};
@@ -325,8 +331,9 @@ struct perf_mmap_data {
void *data_pages[0];
};
-struct perf_wakeup_entry {
- struct perf_wakeup_entry *next;
+struct perf_pending_entry {
+ struct perf_pending_entry *next;
+ void (*func)(struct perf_pending_entry *);
};
/**
@@ -404,7 +411,7 @@ struct perf_counter {
/* poll related */
wait_queue_head_t waitq;
/* optional: for NMIs */
- struct perf_wakeup_entry wakeup;
+ struct perf_pending_entry pending;
void (*destroy)(struct perf_counter *);
struct rcu_head rcu_head;
@@ -493,8 +500,8 @@ extern int hw_perf_group_sched_in(struct
struct perf_counter_context *ctx, int cpu);
extern void perf_counter_update_userpage(struct perf_counter *counter);
-extern void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs);
+extern void perf_counter_overflow(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs);
/*
* Return 1 for a software counter, 0 for a hardware counter
*/
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1563,6 +1563,14 @@ void perf_counter_wakeup(struct perf_cou
wake_up_all(&counter->waitq);
}
+static void perf_pending_wakeup(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ perf_counter_wakeup(counter);
+}
+
/*
* Pending wakeups
*
@@ -1572,45 +1580,47 @@ void perf_counter_wakeup(struct perf_cou
* single linked list and use cmpxchg() to add entries lockless.
*/
-#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
PENDING_TAIL,
};
-static void perf_pending_queue(struct perf_counter *counter)
+static void perf_pending_queue(struct perf_pending_entry *entry,
+ void (*func)(struct perf_pending_entry *))
{
- struct perf_wakeup_entry **head;
- struct perf_wakeup_entry *prev, *next;
+ struct perf_pending_entry **head;
- if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
return;
- head = &get_cpu_var(perf_wakeup_head);
+ entry->func = func;
+
+ head = &get_cpu_var(perf_pending_head);
do {
- prev = counter->wakeup.next = *head;
- next = &counter->wakeup;
- } while (cmpxchg(head, prev, next) != prev);
+ entry->next = *head;
+ } while (cmpxchg(head, entry->next, entry) != entry->next);
set_perf_counter_pending();
- put_cpu_var(perf_wakeup_head);
+ put_cpu_var(perf_pending_head);
}
static int __perf_pending_run(void)
{
- struct perf_wakeup_entry *list;
+ struct perf_pending_entry *list;
int nr = 0;
- list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+ list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
while (list != PENDING_TAIL) {
- struct perf_counter *counter = container_of(list,
- struct perf_counter, wakeup);
+ void (*func)(struct perf_pending_entry *);
+ struct perf_pending_entry *entry = list;
list = list->next;
- counter->wakeup.next = NULL;
+ entry->next = NULL;
+ func = entry->func;
/*
* Ensure we observe the unqueue before we issue the wakeup,
* so that we won't be waiting forever.
@@ -1618,7 +1628,7 @@ static int __perf_pending_run(void)
*/
smp_wmb();
- perf_counter_wakeup(counter);
+ func(entry);
nr++;
}
@@ -1640,7 +1650,7 @@ static inline int perf_not_pending(struc
* so that we do not miss the wakeup. -- see perf_pending_handle()
*/
smp_rmb();
- return counter->wakeup.next == NULL;
+ return counter->pending.next == NULL;
}
static void perf_pending_sync(struct perf_counter *counter)
@@ -1679,9 +1689,10 @@ struct perf_output_handle {
static inline void __perf_output_wakeup(struct perf_output_handle *handle)
{
- if (handle->nmi)
- perf_pending_queue(handle->counter);
- else
+ if (handle->nmi) {
+ perf_pending_queue(&handle->counter->pending,
+ perf_pending_wakeup);
+ } else
perf_counter_wakeup(handle->counter);
}
@@ -1999,6 +2010,47 @@ void perf_counter_munmap(unsigned long a
}
/*
+ * Generic counter overflow handling.
+ */
+
+void perf_counter_singleshot(struct perf_counter *counter)
+{
+ struct pid *pid;
+
+ perf_counter_disable(counter);
+
+ rcu_read_lock();
+ pid = find_vpid(counter->hw_event.signal_tid);
+ if (pid)
+ kill_pid(pid, counter->hw_event.signal_nr, 1);
+ rcu_read_unlock();
+}
+
+void perf_pending_singleshot(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ perf_counter_singleshot(counter);
+}
+
+void perf_counter_overflow(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
+{
+ if (counter->hw_event.singleshot) {
+ if (nmi) {
+ perf_pending_queue(&counter->pending,
+ perf_pending_singleshot);
+ } else
+ perf_counter_singleshot(counter);
+
+ return;
+ }
+
+ perf_counter_output(counter, nmi, regs);
+}
+
+/*
* Generic software counter infrastructure
*/
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 3/6] perf_counter: per event wakeups
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
2009-04-02 9:12 ` [PATCH 2/6] RFC perf_counter: singleshot support Peter Zijlstra
@ 2009-04-02 9:12 ` Peter Zijlstra
2009-04-02 11:32 ` Ingo Molnar
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 9:12 ` [PATCH 4/6] perf_counter: kerneltop: update to new ABI Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:12 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-event-wakeup.patch --]
[-- Type: text/plain, Size: 1634 bytes --]
By request, provide a way to request a wakeup every 'n' events instead
of every page of output.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 3 ++-
kernel/perf_counter.c | 10 +++++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -146,7 +146,7 @@ struct perf_counter_hw_event {
__reserved_1 : 52;
__u32 extra_config_len;
- __u32 __reserved_4;
+ __u32 wakeup_events; /* wakeup every n events */
/*
* Singleshot signal information.
@@ -327,6 +327,7 @@ struct perf_mmap_data {
int nr_pages;
atomic_t wakeup;
atomic_t head;
+ atomic_t events;
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf
static void perf_output_end(struct perf_output_handle *handle)
{
- if (handle->wakeup)
+ int wakeup_events = handle->counter->hw_event.wakeup_events;
+
+ if (wakeup_events) {
+ int events = atomic_inc_return(&handle->data->events);
+ if (events >= wakeup_events) {
+ atomic_sub(wakeup_events, &handle->data->events);
+ __perf_output_wakeup(handle);
+ }
+ } else if (handle->wakeup)
__perf_output_wakeup(handle);
rcu_read_unlock();
}
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 4/6] perf_counter: kerneltop: update to new ABI
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
` (2 preceding siblings ...)
2009-04-02 9:12 ` [PATCH 3/6] perf_counter: per event wakeups Peter Zijlstra
@ 2009-04-02 9:12 ` Peter Zijlstra
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
2009-04-02 9:12 ` [PATCH 6/6] perf_counter: update mmap() counter read Peter Zijlstra
5 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:12 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: kerneltop-new-abi.patch --]
[-- Type: text/plain, Size: 1602 bytes --]
Update to reflect the new record_type ABI changes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Documentation/perf_counter/kerneltop.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
Index: linux-2.6/Documentation/perf_counter/kerneltop.c
===================================================================
--- linux-2.6.orig/Documentation/perf_counter/kerneltop.c
+++ linux-2.6/Documentation/perf_counter/kerneltop.c
@@ -442,7 +442,7 @@ static void create_perfstat_counter(int
memset(&hw_event, 0, sizeof(hw_event));
hw_event.config = event_id[counter];
- hw_event.record_type = PERF_RECORD_SIMPLE;
+ hw_event.record_type = 0;
hw_event.nmi = 0;
if (scale)
hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -1277,8 +1277,8 @@ static void mmap_read(struct mmap_data *
old += size;
switch (event->header.type) {
- case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
- case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
+ case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
+ case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
process_event(event->ip.ip, md->counter);
break;
@@ -1337,9 +1337,8 @@ int main(int argc, char *argv[])
memset(&hw_event, 0, sizeof(hw_event));
hw_event.config = event_id[counter];
hw_event.irq_period = event_count[counter];
- hw_event.record_type = PERF_RECORD_IRQ;
+ hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
hw_event.nmi = nmi;
- hw_event.include_tid = 1;
hw_event.mmap = use_mmap;
hw_event.munmap = use_munmap;
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 5/6] perf_counter: add more context information
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
` (3 preceding siblings ...)
2009-04-02 9:12 ` [PATCH 4/6] perf_counter: kerneltop: update to new ABI Peter Zijlstra
@ 2009-04-02 9:12 ` Peter Zijlstra
2009-04-02 11:36 ` Ingo Molnar
` (2 more replies)
2009-04-02 9:12 ` [PATCH 6/6] perf_counter: update mmap() counter read Peter Zijlstra
5 siblings, 3 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:12 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter_callchain_context.patch --]
[-- Type: text/plain, Size: 2665 bytes --]
Put in counts to tell which ips belong to what context.
-----
| | hv
| --
nr | | kernel
| --
| | user
-----
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++
include/linux/perf_counter.h | 4 ++--
kernel/perf_counter.c | 2 +-
3 files changed, 12 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *re
{
unsigned long bp;
char *stack;
+ int nr = entry->nr;
callchain_store(entry, instruction_pointer(regs));
@@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *re
#endif
dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+ entry->kernel = entry->nr - nr;
}
@@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs
{
struct stack_frame frame;
const void __user *fp;
+ int nr = entry->nr;
regs = (struct pt_regs *)current->thread.sp0 - 1;
fp = (void __user *)regs->bp;
@@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs
callchain_store(entry, frame.return_address);
fp = frame.next_fp;
}
+
+ entry->user = entry->nr - nr;
}
static void
@@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callch
entry = &__get_cpu_var(irq_entry);
entry->nr = 0;
+ entry->hv = 0;
+ entry->kernel = 0;
+ entry->user = 0;
perf_do_callchain(regs, entry);
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -520,10 +520,10 @@ extern void perf_counter_mmap(unsigned l
extern void perf_counter_munmap(unsigned long addr, unsigned long len,
unsigned long pgoff, struct file *file);
-#define MAX_STACK_DEPTH 255
+#define MAX_STACK_DEPTH 254
struct perf_callchain_entry {
- u64 nr;
+ u32 nr, hv, kernel, user;
u64 ip[MAX_STACK_DEPTH];
};
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1830,7 +1830,7 @@ void perf_counter_output(struct perf_cou
callchain = perf_callchain(regs);
if (callchain) {
- callchain_size = (1 + callchain->nr) * sizeof(u64);
+ callchain_size = (2 + callchain->nr) * sizeof(u64);
header.type |= __PERF_EVENT_CALLCHAIN;
header.size += callchain_size;
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* [PATCH 6/6] perf_counter: update mmap() counter read
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
` (4 preceding siblings ...)
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
@ 2009-04-02 9:12 ` Peter Zijlstra
2009-04-02 12:04 ` [tip:perfcounters/core] " Peter Zijlstra
5 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 9:12 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-mmap_counter_read.patch --]
[-- Type: text/plain, Size: 2648 bytes --]
Paul noted that we don't need SMP barriers for the mmap() counter read
because its always on the same cpu (otherwise you can't access the hw
counter anyway).
So remove the SMP barriers and replace them with regular compiler
barriers.
Further, update the comment to include a race-free method of reading
said hardware counter. The primary change is putting the pmc_read
inside the seq-loop, otherwise we can still race and read rubbish.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 30 ++++++++++++++----------------
kernel/perf_counter.c | 4 ++--
2 files changed, 16 insertions(+), 18 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -173,30 +173,28 @@ struct perf_counter_mmap_page {
/*
* Bits needed to read the hw counters in user-space.
*
- * The index and offset should be read atomically using the seqlock:
- *
- * __u32 seq, index;
- * __s64 offset;
- *
- * again:
- * rmb();
- * seq = pc->lock;
+ * u32 seq;
+ * s64 count;
*
+ * again:
+ * seq = pc->lock;
* if (unlikely(seq & 1)) {
* cpu_relax();
- * goto again;
- * }
+ * goto again;
+ * }
*
- * index = pc->index;
- * offset = pc->offset;
+ * if (pc->index) {
+ * count = pmc_read(pc->index - 1);
+ * count += pc->offset;
+ * } else
+ * goto regular_read;
*
- * rmb();
+ * barrier();
* if (pc->lock != seq)
* goto again;
*
- * After this, index contains architecture specific counter index + 1,
- * so that 0 means unavailable, offset contains the value to be added
- * to the result of the raw timer read to obtain this counter's value.
+ * NOTE: for obvious reason this only works on self-monitoring
+ * processes.
*/
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct
*/
preempt_disable();
++userpg->lock;
- smp_wmb();
+ barrier();
userpg->index = counter->hw.idx;
userpg->offset = atomic64_read(&counter->count);
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
userpg->offset -= atomic64_read(&counter->hw.prev_count);
- smp_wmb();
+ barrier();
++userpg->lock;
preempt_enable();
unlock:
--
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 9:12 ` [PATCH 2/6] RFC perf_counter: singleshot support Peter Zijlstra
@ 2009-04-02 10:51 ` Ingo Molnar
2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 12:18 ` Peter Zijlstra
1 sibling, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 10:51 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> By request, provide a way for counters to disable themselves and
> signal at the first counter overflow.
>
> This isn't complete, we really want pending work to be done ASAP
> after queueing it. My preferred method would be a self-IPI, that
> would ensure we run the code in a usable context right after the
> current (IRQ-off, NMI) context is done.
Hm. I do think self-IPIs can be fragile but the more work we do in
NMI context the more compelling of a case can be made for a
self-IPI. So no big arguments against that.
Regarding single-shot - i think the code you posted fails to do the
single-shot aspect: the counter does not stop, to wait for the
signal handler to get its act together.
So i'd suggest to separate the two concepts: add signal support and
single-shot via two attributes. User-space might decide to use
single-shot if it does not want to deal with queued signals or lost
events due to not processing the previous signal fast enough.
Also, user-space might want to use single-shot _without_ signals
perhaps.
And that brings up that user-space might want to generate N events
and stop then, reliably. I.e. single-shot is a specific form of a
"trigger limit".
Plus the question comes up: dont we need an ioctl to let user-space
refill/re-enable the trigger limit?
That is an issue for the signal case as well: we stop the counter
... what restarts it? Does the signal handler have to close the
counter and re-create it just to get the next single-shot event?
So i think we need 3 separate things:
- the ability to set a signal attribute of the counter (during
creation) via a (signo,tid) pair.
Semantics:
- it can be a regular signal (signo < 32),
or an RT/queued signal (signo >= 32).
- It may be sent to the task that generated the event (tid == 0),
or it may be sent to a specific task (tid > 0),
or it may be sent to a task group (tid < 0).
- 'event limit' attribute: the ability to pause new events after N
events. This limit auto-decrements on each event.
limit==1 is the special case for single-shot.
- new ioctl method to refill the limit, when user-space is ready to
receive new events. A special-case of this is when a signal
handler calls ioctl(refill_limit, 1) in the single-shot case -
this re-enables events after the signal has been handled.
Another observation: i think perf_counter_output() needs to depend
on whether the counter is signalling, not on the single-shot-ness of
the counter.
A completely valid use of this would be for user-space to create an
mmap() buffer of 1024 events, then set the limit to 1024, and wait
for the 1024 events to happen - process them and close the counter.
Without any signalling.
Basically the 'limit' allows counter events to be used as a tracer
in essence. So i think it's a nice touch of the whole scheme, and it
should be decoupled from the signal attribute.
Hm?
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
@ 2009-04-02 11:28 ` Ingo Molnar
2009-04-02 11:43 ` Ingo Molnar
` (2 subsequent siblings)
3 siblings, 0 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 11:28 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Per suggestion from Paul, move the event overflow bits to record_type
> and sanitize the enums a bit.
>
> Breaks the ABI -- again ;-)
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> include/linux/perf_counter.h | 50 ++++++++++++---------
> kernel/perf_counter.c | 99 ++++++++++++++++---------------------------
> 2 files changed, 67 insertions(+), 82 deletions(-)
nice cleanup!
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 3/6] perf_counter: per event wakeups
2009-04-02 9:12 ` [PATCH 3/6] perf_counter: per event wakeups Peter Zijlstra
@ 2009-04-02 11:32 ` Ingo Molnar
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
1 sibling, 0 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 11:32 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> By request, provide a way to request a wakeup every 'n' events instead
> of every page of output.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> include/linux/perf_counter.h | 3 ++-
> kernel/perf_counter.c | 10 +++++++++-
> 2 files changed, 11 insertions(+), 2 deletions(-)
>
> Index: linux-2.6/include/linux/perf_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_counter.h
> +++ linux-2.6/include/linux/perf_counter.h
> @@ -146,7 +146,7 @@ struct perf_counter_hw_event {
> __reserved_1 : 52;
>
> __u32 extra_config_len;
> - __u32 __reserved_4;
> + __u32 wakeup_events; /* wakeup every n events */
[ Sidenote: i think we need more reserved bits :-) Expand the ABI
while it's still easy ;-) ]
I have a design observation here:
i think 'wakeup' is a user-space notification type. Another type of
notification can be 'signal', or 'none'.
Perhaps your current scheme makes the most sense: to treat them as
separate attributes, and allow signal delivery and wakeups at once.
(if user-space so wishes)
Agreed?
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
@ 2009-04-02 11:36 ` Ingo Molnar
2009-04-02 11:46 ` Peter Zijlstra
2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 12:04 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-03 12:50 ` [PATCH 5/6] " Peter Zijlstra
2 siblings, 2 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 11:36 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Put in counts to tell which ips belong to what context.
>
> -----
> | | hv
> | --
> nr | | kernel
> | --
> | | user
> -----
btw., i have an observation about the format:
> -#define MAX_STACK_DEPTH 255
> +#define MAX_STACK_DEPTH 254
>
> struct perf_callchain_entry {
> - u64 nr;
> + u32 nr, hv, kernel, user;
> u64 ip[MAX_STACK_DEPTH];
> };
For the special case of signal notifications, if the signal is
delivered immediately to the same task that raised it (pid=0), the
call chain is actually a still meaningful one: it is the stack that
is below the currently executing signal handler context.
Wouldn't it make sense to record the full stack frame for that case,
to allow walking/unwinding of the stack? Or can user-space do that
just fine, based on its own signal context?
We are going to hard-code the "call-chain is a series of IPs,
nothing else" model, and i'd like to make sure it's future-proof :)
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
2009-04-02 11:28 ` Ingo Molnar
@ 2009-04-02 11:43 ` Ingo Molnar
2009-04-02 11:47 ` Peter Zijlstra
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 22:33 ` [PATCH 1/6] " Corey Ashford
3 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 11:43 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> - PERF_EVENT_OVERFLOW = 1UL << 31,
> + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
> header.type = PERF_EVENT_OVERFLOW;
i fixed the obvious sed failure there :)
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 11:36 ` Ingo Molnar
@ 2009-04-02 11:46 ` Peter Zijlstra
2009-04-02 18:16 ` Ingo Molnar
2009-04-02 11:48 ` Peter Zijlstra
1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 11:46 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > Put in counts to tell which ips belong to what context.
> >
> > -----
> > | | hv
> > | --
> > nr | | kernel
> > | --
> > | | user
> > -----
>
> btw., i have an observation about the format:
>
> > -#define MAX_STACK_DEPTH 255
> > +#define MAX_STACK_DEPTH 254
> >
> > struct perf_callchain_entry {
> > - u64 nr;
> > + u32 nr, hv, kernel, user;
> > u64 ip[MAX_STACK_DEPTH];
> > };
>
> For the special case of signal notifications, if the signal is
> delivered immediately to the same task that raised it (pid=0), the
> call chain is actually a still meaningful one: it is the stack that
> is below the currently executing signal handler context.
>
> Wouldnt it make sense to record the full stack frame for that case,
> to allow walking/unwinding of the stack? Or can user-space do that
> just fine, based on its own signal context?
I think it can do that just fine or even better than we can -- userspace
having access to a full dwarf2 unwinder and such.
> We are going to hard-code the "call-chain is a series of IPs,
> nothing else" model, and i'd like to make sure it's future-proof :)
I think it should be, function return addresses are the primary piece of
information here.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 11:43 ` Ingo Molnar
@ 2009-04-02 11:47 ` Peter Zijlstra
0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 11:47 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 13:43 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > - PERF_EVENT_OVERFLOW = 1UL << 31,
> > + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
>
> > header.type = PERF_EVENT_OVERFLOW;
>
> i fixed the obvious sed failure there :)
Yeah, lost a refresh :/
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 11:36 ` Ingo Molnar
2009-04-02 11:46 ` Peter Zijlstra
@ 2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 18:18 ` Ingo Molnar
1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 11:48 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> > -#define MAX_STACK_DEPTH 255
> > +#define MAX_STACK_DEPTH 254
> >
> > struct perf_callchain_entry {
> > - u64 nr;
> > + u32 nr, hv, kernel, user;
> > u64 ip[MAX_STACK_DEPTH];
> > };
Oh, and Paul suggested using u16s right after I sent it out. So I'll
either send an update or send an incremental in case you already applied
it.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 10:51 ` Ingo Molnar
@ 2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 12:26 ` Ingo Molnar
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 11:48 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 12:51 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > By request, provide a way for counters to disable themselves and
> > signal at the first counter overflow.
> >
> > This isn't complete, we really want pending work to be done ASAP
> > after queueing it. My preferred method would be a self-IPI, that
> > would ensure we run the code in a usable context right after the
> > current (IRQ-off, NMI) context is done.
>
> Hm. I do think self-IPIs can be fragile but the more work we do in
> NMI context the more compelling of a case can be made for a
> self-IPI. So no big arguments against that.
It's not only NMI, but also things like software events in the scheduler
under rq->lock, or hrtimers in irq context. You cannot do a wakeup from
under rq->lock, nor hrtimer_cancel() from within the timer handler.
All these nasty little issues stack up and could be solved with a
self-IPI.
Then there is the software task-time clock which uses
p->se.sum_exec_runtime which requires the rq->lock to be read. Coupling
this with for example an NMI overflow handler gives an instant deadlock.
Would you terribly mind if I remove all that sum_exec_runtime and
rq->lock stuff and simply use cpu_clock() to keep count. These things
get context switched along with tasks anyway.
> So i think we need 3 separate things:
>
> - the ability to set a signal attribute of the counter (during
> creation) via a (signo,tid) pair.
>
> Semantics:
>
> - it can be a regular signal (signo < 32),
> or an RT/queued signal (signo >= 32).
>
> - It may be sent to the task that generated the event (tid == 0),
> or it may be sent to a specific task (tid > 0),
> or it may be sent to a task group (tid < 0).
kill_pid() seems to be able to do all of that:
struct pid *pid;
int tid, priv;
perf_counter_disable(counter);
rcu_read_lock();
tid = counter->hw_event.signal_tid;
if (!tid)
tid = current->pid;
priv = 1;
if (tid < 0) {
priv = 0;
tid = -tid;
}
pid = find_vpid(tid);
if (pid)
kill_pid(pid, counter->hw_event.signal_nr, priv);
rcu_read_unlock();
Should do I afaict.
Except I probably should look into this pid-namespace mess and clean all
that up.
> - 'event limit' attribute: the ability to pause new events after N
> events. This limit auto-decrements on each event.
> limit==1 is the special case for single-shot.
That should go along with a toggle on what an event is I suppose, either
an 'output' event or a filled page?
Or do we want to limit that to counter overflow?
> - new ioctl method to refill the limit, when user-space is ready to
> receive new events. A special-case of this is when a signal
> handler calls ioctl(refill_limit, 1) in the single-shot case -
> this re-enables events after the signal has been handled.
Right, with the method implemented above, it's simply a matter of the
enable ioctl.
> Another observation: i think perf_counter_output() needs to depend
> on whether the counter is signalling, not on the single-shot-ness of
> the counter.
>
> A completely valid use of this would be for user-space to create an
> mmap() buffer of 1024 events, then set the limit to 1024, and wait
> for the 1024 events to happen - process them and close the counter.
> Without any signalling.
Say we have a limit > 1, and a signal, that would mean we do not
generate event output?
^ permalink raw reply [flat|nested] 58+ messages in thread
* [tip:perfcounters/core] perf_counter: move the event overflow output bits to record_type
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
2009-04-02 11:28 ` Ingo Molnar
2009-04-02 11:43 ` Ingo Molnar
@ 2009-04-02 12:03 ` Peter Zijlstra
2009-04-02 22:33 ` [PATCH 1/6] " Corey Ashford
3 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:03 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 59f479bfec417dc9b532d4670d77d53d1a16766b
Gitweb: http://git.kernel.org/tip/59f479bfec417dc9b532d4670d77d53d1a16766b
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 2 Apr 2009 11:11:59 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 2 Apr 2009 13:52:59 +0200
perf_counter: move the event overflow output bits to record_type
Per suggestion from Paul, move the event overflow bits to record_type
and sanitize the enums a bit.
Breaks the ABI -- again ;-)
Suggested-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090402091319.151921176@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 50 ++++++++++++---------
kernel/perf_counter.c | 101 ++++++++++++++++-------------------------
2 files changed, 68 insertions(+), 83 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 43083af..06a6fba 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,15 +73,6 @@ enum sw_event_ids {
PERF_SW_EVENTS_MAX = 7,
};
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
- PERF_RECORD_SIMPLE = 0,
- PERF_RECORD_IRQ = 1,
- PERF_RECORD_GROUP = 2,
-};
-
#define __PERF_COUNTER_MASK(name) \
(((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
PERF_COUNTER_##name##_SHIFT)
@@ -103,6 +94,17 @@ enum perf_counter_record_type {
#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+ PERF_RECORD_IP = 1U << 0,
+ PERF_RECORD_TID = 1U << 1,
+ PERF_RECORD_GROUP = 1U << 2,
+ PERF_RECORD_CALLCHAIN = 1U << 3,
+};
+
+/*
* Bits that can be set in hw_event.read_format to request that
* reads on the counter should return the indicated quantities,
* in increasing order of bit value, after the counter value.
@@ -125,8 +127,8 @@ struct perf_counter_hw_event {
__u64 config;
__u64 irq_period;
- __u64 record_type;
- __u64 read_format;
+ __u32 record_type;
+ __u32 read_format;
__u64 disabled : 1, /* off by default */
nmi : 1, /* NMI sampling */
@@ -137,12 +139,10 @@ struct perf_counter_hw_event {
exclude_kernel : 1, /* ditto kernel */
exclude_hv : 1, /* ditto hypervisor */
exclude_idle : 1, /* don't count when idle */
- include_tid : 1, /* include the tid */
mmap : 1, /* include mmap data */
munmap : 1, /* include munmap data */
- callchain : 1, /* add callchain data */
- __reserved_1 : 51;
+ __reserved_1 : 53;
__u32 extra_config_len;
__u32 __reserved_4;
@@ -212,15 +212,21 @@ struct perf_event_header {
enum perf_event_type {
- PERF_EVENT_GROUP = 1,
-
- PERF_EVENT_MMAP = 2,
- PERF_EVENT_MUNMAP = 3,
+ PERF_EVENT_MMAP = 1,
+ PERF_EVENT_MUNMAP = 2,
- PERF_EVENT_OVERFLOW = 1UL << 31,
- __PERF_EVENT_IP = 1UL << 30,
- __PERF_EVENT_TID = 1UL << 29,
- __PERF_EVENT_CALLCHAIN = 1UL << 28,
+ /*
+ * Half the event type space is reserved for the counter overflow
+ * bitfields, as found in hw_event.record_type.
+ *
+ * These events will have types of the form:
+ * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+ */
+ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
+ __PERF_EVENT_IP = PERF_RECORD_IP,
+ __PERF_EVENT_TID = PERF_RECORD_TID,
+ __PERF_EVENT_GROUP = PERF_RECORD_GROUP,
+ __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN,
};
#ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 860cdc2..995063d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
-static void perf_output_simple(struct perf_counter *counter,
- int nmi, struct pt_regs *regs)
+void perf_counter_output(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
{
int ret;
+ u64 record_type = counter->hw_event.record_type;
struct perf_output_handle handle;
struct perf_event_header header;
u64 ip;
struct {
u32 pid, tid;
} tid_entry;
+ struct {
+ u64 event;
+ u64 counter;
+ } group_entry;
struct perf_callchain_entry *callchain = NULL;
int callchain_size = 0;
- header.type = PERF_EVENT_OVERFLOW;
+ header.type = PERF_EVENT_COUNTER_OVERFLOW;
header.size = sizeof(header);
- ip = instruction_pointer(regs);
- header.type |= __PERF_EVENT_IP;
- header.size += sizeof(ip);
+ if (record_type & PERF_RECORD_IP) {
+ ip = instruction_pointer(regs);
+ header.type |= __PERF_EVENT_IP;
+ header.size += sizeof(ip);
+ }
- if (counter->hw_event.include_tid) {
+ if (record_type & PERF_RECORD_TID) {
/* namespace issues */
tid_entry.pid = current->group_leader->pid;
tid_entry.tid = current->pid;
@@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter,
header.size += sizeof(tid_entry);
}
- if (counter->hw_event.callchain) {
+ if (record_type & PERF_RECORD_GROUP) {
+ header.type |= __PERF_EVENT_GROUP;
+ header.size += sizeof(u64) +
+ counter->nr_siblings * sizeof(group_entry);
+ }
+
+ if (record_type & PERF_RECORD_CALLCHAIN) {
callchain = perf_callchain(regs);
if (callchain) {
@@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter,
return;
perf_output_put(&handle, header);
- perf_output_put(&handle, ip);
- if (counter->hw_event.include_tid)
- perf_output_put(&handle, tid_entry);
+ if (record_type & PERF_RECORD_IP)
+ perf_output_put(&handle, ip);
- if (callchain)
- perf_output_copy(&handle, callchain, callchain_size);
-
- perf_output_end(&handle);
-}
-
-static void perf_output_group(struct perf_counter *counter, int nmi)
-{
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_counter *leader, *sub;
- unsigned int size;
- struct {
- u64 event;
- u64 counter;
- } entry;
- int ret;
-
- size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+ if (record_type & PERF_RECORD_TID)
+ perf_output_put(&handle, tid_entry);
- ret = perf_output_begin(&handle, counter, size, nmi);
- if (ret)
- return;
+ if (record_type & PERF_RECORD_GROUP) {
+ struct perf_counter *leader, *sub;
+ u64 nr = counter->nr_siblings;
- header.type = PERF_EVENT_GROUP;
- header.size = size;
+ perf_output_put(&handle, nr);
- perf_output_put(&handle, header);
+ leader = counter->group_leader;
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ if (sub != counter)
+ sub->hw_ops->read(sub);
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->hw_ops->read(sub);
+ group_entry.event = sub->hw_event.config;
+ group_entry.counter = atomic64_read(&sub->count);
- entry.event = sub->hw_event.config;
- entry.counter = atomic64_read(&sub->count);
-
- perf_output_put(&handle, entry);
+ perf_output_put(&handle, group_entry);
+ }
}
- perf_output_end(&handle);
-}
-
-void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs)
-{
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- return;
-
- case PERF_RECORD_IRQ:
- perf_output_simple(counter, nmi, regs);
- break;
+ if (callchain)
+ perf_output_copy(&handle, callchain, callchain_size);
- case PERF_RECORD_GROUP:
- perf_output_group(counter, nmi);
- break;
- }
+ perf_output_end(&handle);
}
/*
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [tip:perfcounters/core] perf_counter: per event wakeups
2009-04-02 9:12 ` [PATCH 3/6] perf_counter: per event wakeups Peter Zijlstra
2009-04-02 11:32 ` Ingo Molnar
@ 2009-04-02 12:03 ` Peter Zijlstra
1 sibling, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:03 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: b202978dae5fce00d19480360f165a181be19704
Gitweb: http://git.kernel.org/tip/b202978dae5fce00d19480360f165a181be19704
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 2 Apr 2009 11:12:01 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 2 Apr 2009 13:52:59 +0200
perf_counter: per event wakeups
By request, provide a way to request a wakeup every 'n' events instead
of every page of output.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090402091319.323309784@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 3 ++-
kernel/perf_counter.c | 10 +++++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 06a6fba..5428ba1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -145,7 +145,7 @@ struct perf_counter_hw_event {
__reserved_1 : 53;
__u32 extra_config_len;
- __u32 __reserved_4;
+ __u32 wakeup_events; /* wakeup every n events */
__u64 __reserved_2;
__u64 __reserved_3;
@@ -321,6 +321,7 @@ struct perf_mmap_data {
int nr_pages;
atomic_t wakeup;
atomic_t head;
+ atomic_t events;
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 995063d..9bcab10 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle,
static void perf_output_end(struct perf_output_handle *handle)
{
- if (handle->wakeup)
+ int wakeup_events = handle->counter->hw_event.wakeup_events;
+
+ if (wakeup_events) {
+ int events = atomic_inc_return(&handle->data->events);
+ if (events >= wakeup_events) {
+ atomic_sub(wakeup_events, &handle->data->events);
+ __perf_output_wakeup(handle);
+ }
+ } else if (handle->wakeup)
__perf_output_wakeup(handle);
rcu_read_unlock();
}
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 9:12 ` [PATCH 4/6] perf_counter: kerneltop: update to new ABI Peter Zijlstra
@ 2009-04-02 12:03 ` Peter Zijlstra
2009-04-02 13:35 ` Jaswinder Singh Rajput
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:03 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
perf_counter: kerneltop: update to new ABI
Update to reflect the new record_type ABI changes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090402091319.407283141@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/kerneltop.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 33b4fcf..4f8d791 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -442,7 +442,7 @@ static void create_perfstat_counter(int counter)
memset(&hw_event, 0, sizeof(hw_event));
hw_event.config = event_id[counter];
- hw_event.record_type = PERF_RECORD_SIMPLE;
+ hw_event.record_type = 0;
hw_event.nmi = 0;
if (scale)
hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -1277,8 +1277,8 @@ static void mmap_read(struct mmap_data *md)
old += size;
switch (event->header.type) {
- case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
- case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
+ case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
+ case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
process_event(event->ip.ip, md->counter);
break;
@@ -1337,9 +1337,8 @@ int main(int argc, char *argv[])
memset(&hw_event, 0, sizeof(hw_event));
hw_event.config = event_id[counter];
hw_event.irq_period = event_count[counter];
- hw_event.record_type = PERF_RECORD_IRQ;
+ hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
hw_event.nmi = nmi;
- hw_event.include_tid = 1;
hw_event.mmap = use_mmap;
hw_event.munmap = use_munmap;
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [tip:perfcounters/core] perf_counter: add more context information
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
2009-04-02 11:36 ` Ingo Molnar
@ 2009-04-02 12:04 ` Peter Zijlstra
2009-04-03 12:50 ` [PATCH 5/6] " Peter Zijlstra
2 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:04 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 13c47e37b33167d9784ffbb2c921f65279665dd7
Gitweb: http://git.kernel.org/tip/13c47e37b33167d9784ffbb2c921f65279665dd7
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 2 Apr 2009 11:12:03 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
perf_counter: add more context information
Put in counts to tell which ips belong to what context.
-----
| | hv
| --
nr | | kernel
| --
| | user
-----
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090402091319.493101305@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++
include/linux/perf_counter.h | 4 ++--
kernel/perf_counter.c | 2 +-
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2a946a1..c74e20d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
unsigned long bp;
char *stack;
+ int nr = entry->nr;
callchain_store(entry, instruction_pointer(regs));
@@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
#endif
dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+ entry->kernel = entry->nr - nr;
}
@@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
struct stack_frame frame;
const void __user *fp;
+ int nr = entry->nr;
regs = (struct pt_regs *)current->thread.sp0 - 1;
fp = (void __user *)regs->bp;
@@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
callchain_store(entry, frame.return_address);
fp = frame.next_fp;
}
+
+ entry->user = entry->nr - nr;
}
static void
@@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
entry = &__get_cpu_var(irq_entry);
entry->nr = 0;
+ entry->hv = 0;
+ entry->kernel = 0;
+ entry->user = 0;
perf_do_callchain(regs, entry);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5428ba1..90cce0c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
extern void perf_counter_munmap(unsigned long addr, unsigned long len,
unsigned long pgoff, struct file *file);
-#define MAX_STACK_DEPTH 255
+#define MAX_STACK_DEPTH 254
struct perf_callchain_entry {
- u64 nr;
+ u32 nr, hv, kernel, user;
u64 ip[MAX_STACK_DEPTH];
};
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 9bcab10..f105a6e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter,
callchain = perf_callchain(regs);
if (callchain) {
- callchain_size = (1 + callchain->nr) * sizeof(u64);
+ callchain_size = (2 + callchain->nr) * sizeof(u64);
header.type |= __PERF_EVENT_CALLCHAIN;
header.size += callchain_size;
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [tip:perfcounters/core] perf_counter: update mmap() counter read
2009-04-02 9:12 ` [PATCH 6/6] perf_counter: update mmap() counter read Peter Zijlstra
@ 2009-04-02 12:04 ` Peter Zijlstra
0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:04 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, cjashfor,
mingo
Commit-ID: 895eadace59ff56e9b86e480c9b05da61822c82f
Gitweb: http://git.kernel.org/tip/895eadace59ff56e9b86e480c9b05da61822c82f
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 2 Apr 2009 11:12:04 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 2 Apr 2009 13:53:01 +0200
perf_counter: update mmap() counter read
Paul noted that we don't need SMP barriers for the mmap() counter read
because its always on the same cpu (otherwise you can't access the hw
counter anyway).
So remove the SMP barriers and replace them with regular compiler
barriers.
Further, update the comment to include a race free method of reading
said hardware counter. The primary change is putting the pmc_read
inside the seq-loop, otherwise we can still race and read rubbish.
Noticed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090402091319.577951445@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 22 ++++++++++------------
kernel/perf_counter.c | 4 ++--
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 90cce0c..f2b914d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -167,30 +167,28 @@ struct perf_counter_mmap_page {
/*
* Bits needed to read the hw counters in user-space.
*
- * The index and offset should be read atomically using the seqlock:
- *
- * __u32 seq, index;
- * __s64 offset;
+ * u32 seq;
+ * s64 count;
*
* again:
- * rmb();
* seq = pc->lock;
- *
* if (unlikely(seq & 1)) {
* cpu_relax();
* goto again;
* }
*
- * index = pc->index;
- * offset = pc->offset;
+ * if (pc->index) {
+ * count = pmc_read(pc->index - 1);
+ * count += pc->offset;
+ * } else
+ * goto regular_read;
*
- * rmb();
+ * barrier();
* if (pc->lock != seq)
* goto again;
*
- * After this, index contains architecture specific counter index + 1,
- * so that 0 means unavailable, offset contains the value to be added
- * to the result of the raw timer read to obtain this counter's value.
+ * NOTE: for obvious reason this only works on self-monitoring
+ * processes.
*/
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f105a6e..2a5d4f5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter)
*/
preempt_disable();
++userpg->lock;
- smp_wmb();
+ barrier();
userpg->index = counter->hw.idx;
userpg->offset = atomic64_read(&counter->count);
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
userpg->offset -= atomic64_read(&counter->hw.prev_count);
- smp_wmb();
+ barrier();
++userpg->lock;
preempt_enable();
unlock:
^ permalink raw reply related [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 9:12 ` [PATCH 2/6] RFC perf_counter: singleshot support Peter Zijlstra
2009-04-02 10:51 ` Ingo Molnar
@ 2009-04-02 12:18 ` Peter Zijlstra
2009-04-02 18:10 ` Ingo Molnar
1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 12:18 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
OK, so I just remembered the sigio thing mentioned by Paul a while back.
and this should do I suppose...
---
Subject: perf_counter: SIGIO support
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu Apr 02 14:15:54 CEST 2009
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 2 ++
kernel/perf_counter.c | 20 +++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -238,6 +238,7 @@ enum perf_event_type {
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
+#include <linux/fs.h>
#include <asm/atomic.h>
struct task_struct;
@@ -397,6 +398,7 @@ struct perf_counter {
/* poll related */
wait_queue_head_t waitq;
+ struct fasync_struct *fasync;
/* optional: for NMIs */
struct perf_wakeup_entry wakeup;
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1526,6 +1526,22 @@ out:
return ret;
}
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct perf_counter *counter = filp->private_data;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &counter->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
@@ -1533,6 +1549,7 @@ static const struct file_operations perf
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_ioctl,
.mmap = perf_mmap,
+ .fasync = perf_fasync,
};
/*
@@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_cou
rcu_read_lock();
data = rcu_dereference(counter->data);
if (data) {
- (void)atomic_xchg(&data->wakeup, POLL_IN);
+ atomic_set(&data->wakeup, POLL_IN);
/*
* Ensure all data writes are issued before updating the
* user-space data head information. The matching rmb()
@@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
rcu_read_unlock();
wake_up_all(&counter->waitq);
+ kill_fasync(&counter->fasync, SIGIO, POLL_IN);
}
/*
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 11:48 ` Peter Zijlstra
@ 2009-04-02 12:26 ` Ingo Molnar
2009-04-02 21:23 ` Paul Mackerras
0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 12:26 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2009-04-02 at 12:51 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> > > By request, provide a way for counters to disable themselves and
> > > signal at the first counter overflow.
> > >
> > > This isn't complete, we really want pending work to be done ASAP
> > > after queueing it. My preferred method would be a self-IPI, that
> > > would ensure we run the code in a usable context right after the
> > > current (IRQ-off, NMI) context is done.
> >
> > Hm. I do think self-IPIs can be fragile but the more work we do
> > in NMI context the more compelling of a case can be made for a
> > self-IPI. So no big arguments against that.
>
> Its not only NMI, but also things like software events in the
> scheduler under rq->lock, or hrtimers in irq context. You cannot
> do a wakeup from under rq->lock, nor hrtimer_cancel() from within
> the timer handler.
>
> All these nasty little issues stack up and could be solved with a
> self-IPI.
>
> Then there is the software task-time clock which uses
> p->se.sum_exec_runtime which requires the rq->lock to be read.
> Coupling this with for example an NMI overflow handler gives an
> instant deadlock.
Ok, convinced.
> Would you terribly mind if I remove all that sum_exec_runtime and
> rq->lock stuff and simply use cpu_clock() to keep count. These
> things get context switched along with tasks anyway.
Sure. One sidenote - the precision of sw clocks has dropped a bit
lately:
aldebaran:~/linux/linux/Documentation/perf_counter> ./perfstat -e
1:0 -e 1:0 -e 1:0 -e 1:0 -e 1:0 sleep 1
Performance counter stats for 'sleep':
0.762664 cpu clock ticks (msecs)
0.761440 cpu clock ticks (msecs)
0.760977 cpu clock ticks (msecs)
0.760587 cpu clock ticks (msecs)
0.760287 cpu clock ticks (msecs)
Wall-clock time elapsed: 1003.139373 msecs
See that slight but noticeable skew? This used to work fine and we
had the exact same value everywhere. Can we fix that while still
keeping the code nice?
> Except I probably should look into this pid-namespace mess and
> clean all that up.
yeah. Hopefully it's all just a matter of adding or removing a 'v'
somewhere. Get a bit more complicated with system-wide counters
though.
> > - 'event limit' attribute: the ability to pause new events after N
> > events. This limit auto-decrements on each event.
> > limit==1 is the special case for single-shot.
>
> That should go along with a toggle on what an event is I suppose,
> either an 'output' event or a filled page?
>
> Or do we want to limit that to counter overflow?
I think the proper form to rate-limit events and do buffering,
without losing events, is to have an attribute that sets a
buffer-full event threshold in bytes. That works well with variable
sized records. That threshold would normally be set to a multiple of
PAGE_SIZE - with a sensible default of half the mmap area or so?
Right?
> > - new ioctl method to refill the limit, when user-space is ready to
> > receive new events. A special-case of this is when a signal
> > handler calls ioctl(refill_limit, 1) in the single-shot case -
> > this re-enables events after the signal has been handled.
>
> Right, with the method implemented above, its simply a matter of
> the enable ioctl.
ok.
> > Another observation: i think perf_counter_output() needs to
> > depend on whether the counter is signalling, not on the
> > single-shot-ness of the counter.
> >
> > A completely valid use of this would be for user-space to create
> > an mmap() buffer of 1024 events, then set the limit to 1024, and
> > wait for the 1024 events to happen - process them and close the
> > counter. Without any signalling.
>
> Say we have a limit > 1, and a signal, that would mean we do not
> generate event output?
I think we should have two independent limits that both may generate
wakeups.
We have a stream of events filling in records in a buffer area. That
is a given and we have no real influence over them happening (in a
loss free model).
There's two further, independent properties here that make further
sense to manage:
1) what happens on the events themselves
2) the buffer space gets squeezed
Here we have buffering and hence discretion over what happens, how
frequently we wake up and what we do on each individual event.
For the #2 buffer space, in the view of variable size records, the
best metric is bytes i think. The best default is 'half of the mmap
area'. This should influence the wakeup behavior IMO. We only wake
up if buffer space gets tight. (User-space can time out its poll()
call and thus get a timely recording of even smaller-than-threshold
events)
For the #1 'what happens on events' independent case, by default is
that nothing happens. If the signal number is set, we send a signal
- but the buffer space management itself remains independent and we
may or may not wake up, depending on the 'bytes left' metric.
I think the 'trigger limit' threshold is a third independent
attribute which actively throttles output [be that a signal, output
into the buffer space, or both] - if despite the wakeup (or us
sending a signal) nothing happened and we've got too much overlap.
The most common special case for the trigger limit would be in
signal generation mode, with a value of 1. This means the counter
turns off after each signal.
Remember the 'lost events' value patch in the header mmap area? This
would be useful here: if the kernel has to throttle due to hitting
the limit, it would set the overflow counter?
If this gets needlessly complex/weird in the code itself then i made
a thinko somewhere and we need to reconsider. :-)
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
@ 2009-04-02 13:35 ` Jaswinder Singh Rajput
2009-04-02 13:59 ` Jaswinder Singh Rajput
0 siblings, 1 reply; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 13:35 UTC (permalink / raw)
To: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
mingo
Cc: linux-tip-commits
On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> Committer: Ingo Molnar <mingo@elte.hu>
> CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
>
> perf_counter: kerneltop: update to new ABI
>
> Update to reflect the new record_type ABI changes.
>
perfstat is still having many issues:
1. 0:6: bus-cycles is not valid for AMD, so it fails
2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
can be replaced by ./perfstat -e 0:* ls
3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
can be replaced by ./perfstat -e 1:* ls
4. All events can be replaced by ./perfstat -e * ls
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 13:35 ` Jaswinder Singh Rajput
@ 2009-04-02 13:59 ` Jaswinder Singh Rajput
2009-04-02 18:11 ` Ingo Molnar
0 siblings, 1 reply; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 13:59 UTC (permalink / raw)
To: mingo
Cc: hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor, mingo,
linux-tip-commits
On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > Committer: Ingo Molnar <mingo@elte.hu>
> > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> >
> > perf_counter: kerneltop: update to new ABI
> >
> > Update to reflect the new record_type ABI changes.
> >
>
> perfstat is still having many issues:
>
> 1. 0:6: bus-cycles is not valid for AMD, so it fails
>
> 2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
> can be replaced by ./perfstat -e 0:* ls
>
> 3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
> can be replaced by ./perfstat -e 1:* ls
>
> 4. All events can be replaced by ./perfstat -e * ls
>
5. This command is invalid:
Sample output:
$ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 12:18 ` Peter Zijlstra
@ 2009-04-02 18:10 ` Ingo Molnar
2009-04-02 18:33 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:10 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> OK, so I just remembered the sigio thing mentioned by Paul a while
> back.
>
> and this should do I suppose...
> @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
> rcu_read_unlock();
>
> wake_up_all(&counter->waitq);
> + kill_fasync(&counter->fasync, SIGIO, POLL_IN);
ah, yes.
Do we even need the explicit signo method this way? The SIGIO target
is configurable via fcntl(F_SETSIG), right?
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 13:59 ` Jaswinder Singh Rajput
@ 2009-04-02 18:11 ` Ingo Molnar
2009-04-02 18:22 ` Jaswinder Singh Rajput
0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:11 UTC (permalink / raw)
To: Jaswinder Singh Rajput
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
* Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> > On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > > Committer: Ingo Molnar <mingo@elte.hu>
> > > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> > >
> > > perf_counter: kerneltop: update to new ABI
> > >
> > > Update to reflect the new record_type ABI changes.
> > >
> >
> > perfstat is still having many issues:
> >
> > 1. 0:6: bus-cycles is not valid for AMD, so it fails
> >
> > 2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
> > can be replaced by ./perfstat -e 0:* ls
> >
> > 3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
> > can be replaced by ./perfstat -e 1:* ls
> >
> > 4. All events can be replaced by ./perfstat -e * ls
> >
>
> 5. This command is invalid:
> Sample output:
>
> $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
yes, this should be fixed in perfstat: '-e 1' should be accepted as
'-e 0:1'. Patches welcome.
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 11:46 ` Peter Zijlstra
@ 2009-04-02 18:16 ` Ingo Molnar
0 siblings, 0 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:16 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> > > Put in counts to tell which ips belong to what context.
> > >
> > > -----
> > > | | hv
> > > | --
> > > nr | | kernel
> > > | --
> > > | | user
> > > -----
> >
> > btw., i have an observation about the format:
> >
> > > -#define MAX_STACK_DEPTH 255
> > > +#define MAX_STACK_DEPTH 254
> > >
> > > struct perf_callchain_entry {
> > > - u64 nr;
> > > + u32 nr, hv, kernel, user;
> > > u64 ip[MAX_STACK_DEPTH];
> > > };
> >
> > For the special case of signal notifications, if the signal is
> > delivered immediately to the same task that raised it (pid=0), the
> > call chain is actually a still meaningful one: it is the stack that
> > is below the currently executing signal handler context.
> >
> > Wouldnt it make sense to record the full stack frame for that
> > case, to allow walking/unwinding of the stack? Or can user-space
> > do that just fine, based on its own signal context?
>
> I think it can do that just fine or even better than we can --
> userspace having access to a full dwarf2 unwinder and such.
eventually we'll have one in the kernel too, but yeah, user-space
can do this better. It will have precise details about the runtime
environment.
And any async mechanism has no chance to do anything useful with
stack frame info anyway - that stack frame might be long gone.
> > We are going to hard-code the "call-chain is a series of IPs,
> > nothing else" model, and i'd like to make sure it's future-proof
> > :)
>
> I think it should be, function return addresses are the primary
> piece of information here.
ok - good - just wanted to make sure :)
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 11:48 ` Peter Zijlstra
@ 2009-04-02 18:18 ` Ingo Molnar
2009-04-02 18:29 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:18 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
>
> > > -#define MAX_STACK_DEPTH 255
> > > +#define MAX_STACK_DEPTH 254
> > >
> > > struct perf_callchain_entry {
> > > - u64 nr;
> > > + u32 nr, hv, kernel, user;
> > > u64 ip[MAX_STACK_DEPTH];
> > > };
>
> Oh, and Paul suggested using u16s right after I sent it out. So
> I'll either send an update or send an incremental in case you
> already applied it.
yes, that's probably a good idea. Although u8 might be even better -
do we ever want to do more than 256 deep stack vectors? Even those
would take quite some time to construct and pass down.
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:11 ` Ingo Molnar
@ 2009-04-02 18:22 ` Jaswinder Singh Rajput
2009-04-02 18:28 ` Ingo Molnar
2009-04-02 18:32 ` Jaswinder Singh Rajput
0 siblings, 2 replies; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 18:22 UTC (permalink / raw)
To: Ingo Molnar
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
On Thu, 2009-04-02 at 20:11 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
>
> > On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> > > On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > > > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > > > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > > > Committer: Ingo Molnar <mingo@elte.hu>
> > > > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> > > >
> > > > perf_counter: kerneltop: update to new ABI
> > > >
> > > > Update to reflect the new record_type ABI changes.
> > > >
> > >
> > > perfstat is still having many issues:
> > >
> > > 1. 0:6: bus-cycles is not valid for AMD, so it fails
> > >
> > > 2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
> > > can be replaced by ./perfstat -e 0:* ls
> > >
> > > 3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
> > > can be replaced by ./perfstat -e 1:* ls
> > >
> > > 4. All events can be replaced by ./perfstat -e * ls
> > >
> >
> > 5. This command is invalid:
> > Sample output:
> >
> > $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
>
> yes, this should be fixed in perfstat: '-e 1' should be accepted as
> '-e 0:1'. Patches welcome.
>
if you treat '-e 1' as '-e 0:1' then how will you treat '-e 1:1' ?
I think 'e 1:*' is better option to select all software events and '-e
0:*' for all hardware events.
Or '-e 1' will treat as both '-e 0:1,1:1'
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:22 ` Jaswinder Singh Rajput
@ 2009-04-02 18:28 ` Ingo Molnar
2009-04-02 18:38 ` Jaswinder Singh Rajput
2009-04-02 18:51 ` Jaswinder Singh Rajput
2009-04-02 18:32 ` Jaswinder Singh Rajput
1 sibling, 2 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:28 UTC (permalink / raw)
To: Jaswinder Singh Rajput
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
* Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> On Thu, 2009-04-02 at 20:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> >
> > > On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> > > > On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > > > > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > > > > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > > > > Committer: Ingo Molnar <mingo@elte.hu>
> > > > > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> > > > >
> > > > > perf_counter: kerneltop: update to new ABI
> > > > >
> > > > > Update to reflect the new record_type ABI changes.
> > > > >
> > > >
> > > > perfstat is still having many issues:
> > > >
> > > > 1. 0:6: bus-cycles is not valid for AMD, so it fails
> > > >
> > > > 2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
> > > > can be replaced by ./perfstat -e 0:* ls
> > > >
> > > > 3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
> > > > can be replaced by ./perfstat -e 1:* ls
> > > >
> > > > 4. All events can be replaced by ./perfstat -e * ls
> > > >
> > >
> > > 5. This command is invalid:
> > > Sample output:
> > >
> > > $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
> >
> > yes, this should be fixed in perfstat: '-e 1' should be accepted as
> > '-e 0:1'. Patches welcome.
> >
>
> if you treat '-e 1' as 'e 0:1' then how you will treat '-e 1:1' ?
>
> I think 'e 1:*' is better option to select all software events and
> '-e 0:*' for all hardware events.
That is not what i suggested though. I suggested '-e 1' to mean to
default to the more common case: hw counters. I.e.:
-e 1 == -e 0:1
-e 2 == -e 0:2
-e 3 == -e 0:3
...
sw counters could be specified only via the longer form.
Anyway, most people will use symbolic names anyway (-e cycles,
etc.), so i suspect the issue is mostly irrelevant.
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 18:18 ` Ingo Molnar
@ 2009-04-02 18:29 ` Peter Zijlstra
2009-04-02 18:34 ` Ingo Molnar
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 18:29 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 20:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> >
> > > > -#define MAX_STACK_DEPTH 255
> > > > +#define MAX_STACK_DEPTH 254
> > > >
> > > > struct perf_callchain_entry {
> > > > - u64 nr;
> > > > + u32 nr, hv, kernel, user;
> > > > u64 ip[MAX_STACK_DEPTH];
> > > > };
> >
> > Oh, and Paul suggested using u16s right after I send it out. So
> > I'll either send an update or send a incremental in case you
> > already applied it.
>
> yes, that's probably a good idea. Although u8 might be even better -
> do we ever want to do more than 256 deep stack vectors? Even those
> would take quite some time to construct and pass down.
We'd have to pad it with 4 more bytes to remain u64 aligned, also, why
restrict ourselves. That MAX_STACK_DEPTH limit is trivially fixable if
indeed someone finds it insufficient.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:22 ` Jaswinder Singh Rajput
2009-04-02 18:28 ` Ingo Molnar
@ 2009-04-02 18:32 ` Jaswinder Singh Rajput
1 sibling, 0 replies; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 18:32 UTC (permalink / raw)
To: Ingo Molnar
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
On Thu, 2009-04-02 at 23:52 +0530, Jaswinder Singh Rajput wrote:
> On Thu, 2009-04-02 at 20:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> >
> > > On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> > > > On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > > > > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > > > > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > > > > Committer: Ingo Molnar <mingo@elte.hu>
> > > > > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> > > > >
> > > > > perf_counter: kerneltop: update to new ABI
> > > > >
> > > > > Update to reflect the new record_type ABI changes.
> > > > >
> > > >
> > > > perfstat is still having many issues:
> > > >
> > > > 1. 0:6: bus-cycles is not valid for AMD, so it fails
> > > >
> > > > 2. ./perfstat -e 0:1,0:2,0:3,0:4,0:5,0:6 ls
> > > > can be replaced by ./perfstat -e 0:* ls
> > > >
> > > > 3. Similarly ./perfstat -e 1:1,1:2,1:3,1:4,1:5,1:6 ls
> > > > can be replaced by ./perfstat -e 1:* ls
> > > >
> > > > 4. All events can be replaced by ./perfstat -e * ls
> > > >
> > >
> > > 5. This command is invalid:
> > > Sample output:
> > >
> > > $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
> >
> > yes, this should be fixed in perfstat: '-e 1' should be accepted as
> > '-e 0:1'. Patches welcome.
> >
>
Better I will show in pictorial way to avoid any confusion:
> if you treat '-e 1' as 'e 0:1' then how you will treat '-e 1:1' ?
>
'-e 1' ---> '-e 0:1'
? ---> '-e 1:1'
> I think 'e 1:*' is better option to select all software events and '-e
> 0:*' for all hardware events.
>
'-e 0:*' ---> select all hardware events.
'-e 1:*' ---> select all software events.
> Or '-e 1' will treat as both '-e 0:1,1:1'
>
'-e 1' ---> '-e 0:1,1:0'
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 18:10 ` Ingo Molnar
@ 2009-04-02 18:33 ` Peter Zijlstra
0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 18:33 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 20:10 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > OK, so I just remembered the sigio thing mentioned by Paul a while
> > back.
> >
> > and this should do I suppose...
>
> > @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
> > rcu_read_unlock();
> >
> > wake_up_all(&counter->waitq);
> > + kill_fasync(&counter->fasync, SIGIO, POLL_IN);
>
> ah, yes.
>
> Do we even need the explicit signo method this way? The SIGIO target
> is configurable via fcntl(F_SETSIG), right?
F_SETSIG allows you to change the signal that is delivered, SIGIO by
default, F_SETOWN allows you to specify a target.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 18:29 ` Peter Zijlstra
@ 2009-04-02 18:34 ` Ingo Molnar
2009-04-02 18:42 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 18:34 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2009-04-02 at 20:18 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> > > On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> > >
> > > > > -#define MAX_STACK_DEPTH 255
> > > > > +#define MAX_STACK_DEPTH 254
> > > > >
> > > > > struct perf_callchain_entry {
> > > > > - u64 nr;
> > > > > + u32 nr, hv, kernel, user;
> > > > > u64 ip[MAX_STACK_DEPTH];
> > > > > };
> > >
> > > Oh, and Paul suggested using u16s right after I send it out. So
> > > I'll either send an update or send a incremental in case you
> > > already applied it.
> >
> > yes, that's probably a good idea. Although u8 might be even better -
> > do we ever want to do more than 256 deep stack vectors? Even those
> > would take quite some time to construct and pass down.
>
> We'd have to pad it with 4 more bytes to remain u64 aligned,
ok, indeed.
> [...] also, why restrict ourselves. That MAX_STACK_DEPTH limit is
> trivially fixable if indeed someone finds its insufficient.
well .. think about it: walking more than 256 stack frames for every
IRQ event? Getting backtraces like:
<func_0+0x123>
<func_1+0x123>
<func_2+0x123>
<func_3+0x123>
<func_4+0x123>
<func_5+0x123>
<func_6+0x123>
<func_7+0x123>
<func_8+0x123>
<func_9+0x123>
<func_10+0x123>
<func_11+0x123>
<func_12+0x123>
<func_13+0x123>
<func_14+0x123>
<func_15+0x123>
<func_16+0x123>
<func_17+0x123>
<func_18+0x123>
<func_19+0x123>
<func_20+0x123>
<func_21+0x123>
<func_22+0x123>
<func_23+0x123>
<func_24+0x123>
<func_25+0x123>
<func_26+0x123>
<func_27+0x123>
<func_28+0x123>
<func_29+0x123>
<func_30+0x123>
<func_31+0x123>
<func_32+0x123>
<func_33+0x123>
<func_34+0x123>
<func_35+0x123>
<func_36+0x123>
<func_37+0x123>
<func_38+0x123>
<func_39+0x123>
<func_40+0x123>
<func_41+0x123>
<func_42+0x123>
<func_43+0x123>
<func_44+0x123>
<func_45+0x123>
<func_46+0x123>
<func_47+0x123>
<func_48+0x123>
<func_49+0x123>
<func_50+0x123>
<func_51+0x123>
<func_52+0x123>
<func_53+0x123>
<func_54+0x123>
<func_55+0x123>
<func_56+0x123>
<func_57+0x123>
<func_58+0x123>
<func_59+0x123>
<func_60+0x123>
<func_61+0x123>
<func_62+0x123>
<func_63+0x123>
<func_64+0x123>
<func_65+0x123>
<func_66+0x123>
<func_67+0x123>
<func_68+0x123>
<func_69+0x123>
<func_70+0x123>
<func_71+0x123>
<func_72+0x123>
<func_73+0x123>
<func_74+0x123>
<func_75+0x123>
<func_76+0x123>
<func_77+0x123>
<func_78+0x123>
<func_79+0x123>
<func_80+0x123>
<func_81+0x123>
<func_82+0x123>
<func_83+0x123>
<func_84+0x123>
<func_85+0x123>
<func_86+0x123>
<func_87+0x123>
<func_88+0x123>
<func_89+0x123>
<func_90+0x123>
<func_91+0x123>
<func_92+0x123>
<func_93+0x123>
<func_94+0x123>
<func_95+0x123>
<func_96+0x123>
<func_97+0x123>
<func_98+0x123>
<func_99+0x123>
<func_100+0x123>
<func_101+0x123>
<func_102+0x123>
<func_103+0x123>
<func_104+0x123>
<func_105+0x123>
<func_106+0x123>
<func_107+0x123>
<func_108+0x123>
<func_109+0x123>
<func_110+0x123>
<func_111+0x123>
<func_112+0x123>
<func_113+0x123>
<func_114+0x123>
<func_115+0x123>
<func_116+0x123>
<func_117+0x123>
<func_118+0x123>
<func_119+0x123>
<func_120+0x123>
<func_121+0x123>
<func_122+0x123>
<func_123+0x123>
<func_124+0x123>
<func_125+0x123>
<func_126+0x123>
<func_127+0x123>
<func_128+0x123>
<func_129+0x123>
<func_130+0x123>
<func_131+0x123>
<func_132+0x123>
<func_133+0x123>
<func_134+0x123>
<func_135+0x123>
<func_136+0x123>
<func_137+0x123>
<func_138+0x123>
<func_139+0x123>
<func_140+0x123>
<func_141+0x123>
<func_142+0x123>
<func_143+0x123>
<func_144+0x123>
<func_145+0x123>
<func_146+0x123>
<func_147+0x123>
<func_148+0x123>
<func_149+0x123>
<func_150+0x123>
<func_151+0x123>
<func_152+0x123>
<func_153+0x123>
<func_154+0x123>
<func_155+0x123>
<func_156+0x123>
<func_157+0x123>
<func_158+0x123>
<func_159+0x123>
<func_160+0x123>
<func_161+0x123>
<func_162+0x123>
<func_163+0x123>
<func_164+0x123>
<func_165+0x123>
<func_166+0x123>
<func_167+0x123>
<func_168+0x123>
<func_169+0x123>
<func_170+0x123>
<func_171+0x123>
<func_172+0x123>
<func_173+0x123>
<func_174+0x123>
<func_175+0x123>
<func_176+0x123>
<func_177+0x123>
<func_178+0x123>
<func_179+0x123>
<func_180+0x123>
<func_181+0x123>
<func_182+0x123>
<func_183+0x123>
<func_184+0x123>
<func_185+0x123>
<func_186+0x123>
<func_187+0x123>
<func_188+0x123>
<func_189+0x123>
<func_190+0x123>
<func_191+0x123>
<func_192+0x123>
<func_193+0x123>
<func_194+0x123>
<func_195+0x123>
<func_196+0x123>
<func_197+0x123>
<func_198+0x123>
<func_199+0x123>
<func_200+0x123>
<func_201+0x123>
<func_202+0x123>
<func_203+0x123>
<func_204+0x123>
<func_205+0x123>
<func_206+0x123>
<func_207+0x123>
<func_208+0x123>
<func_209+0x123>
<func_210+0x123>
<func_211+0x123>
<func_212+0x123>
<func_213+0x123>
<func_214+0x123>
<func_215+0x123>
<func_216+0x123>
<func_217+0x123>
<func_218+0x123>
<func_219+0x123>
<func_220+0x123>
<func_221+0x123>
<func_222+0x123>
<func_223+0x123>
<func_224+0x123>
<func_225+0x123>
<func_226+0x123>
<func_227+0x123>
<func_228+0x123>
<func_229+0x123>
<func_230+0x123>
<func_231+0x123>
<func_232+0x123>
<func_233+0x123>
<func_234+0x123>
<func_235+0x123>
<func_236+0x123>
<func_237+0x123>
<func_238+0x123>
<func_239+0x123>
<func_240+0x123>
<func_241+0x123>
<func_242+0x123>
<func_243+0x123>
<func_244+0x123>
<func_245+0x123>
<func_246+0x123>
<func_247+0x123>
<func_248+0x123>
<func_249+0x123>
<func_250+0x123>
<func_251+0x123>
<func_252+0x123>
<func_253+0x123>
<func_254+0x123>
<func_255+0x123>
<func_256+0x123>
<func_257+0x123>
<func_258+0x123>
<func_259+0x123>
<func_260+0x123>
<func_261+0x123>
<func_262+0x123>
<func_263+0x123>
<func_264+0x123>
<func_265+0x123>
<func_266+0x123>
<func_267+0x123>
<func_268+0x123>
<func_269+0x123>
does that make much sense _per event_? How do you visualize it?
But yeah ... i could imagine some user-space craziness and since we
want to align to u64 i guess that pretty much settles it to u16.
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:28 ` Ingo Molnar
@ 2009-04-02 18:38 ` Jaswinder Singh Rajput
2009-04-02 19:20 ` Ingo Molnar
2009-04-02 18:51 ` Jaswinder Singh Rajput
1 sibling, 1 reply; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 18:38 UTC (permalink / raw)
To: Ingo Molnar
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
On Thu, 2009-04-02 at 20:28 +0200, Ingo Molnar wrote:
> That is not what i suggested though. I suggested '-e 1' to mean to
> default to the more common case: hw counters. I.e.:
>
> -e 1 == -e 0:1
> -e 2 == -e 0:2
> -e 3 == -e 0:3
> ...
>
> sw counters could be specified only via the longer form.
>
If someone wants to see all supported event counters, then what is your
suggestion — can we provide some shortcut for it?
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 18:34 ` Ingo Molnar
@ 2009-04-02 18:42 ` Peter Zijlstra
2009-04-02 19:19 ` Ingo Molnar
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-02 18:42 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 20:34 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > On Thu, 2009-04-02 at 20:18 +0200, Ingo Molnar wrote:
> > > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > >
> > > > On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> > > >
> > > > > > -#define MAX_STACK_DEPTH 255
> > > > > > +#define MAX_STACK_DEPTH 254
> > > > > >
> > > > > > struct perf_callchain_entry {
> > > > > > - u64 nr;
> > > > > > + u32 nr, hv, kernel, user;
> > > > > > u64 ip[MAX_STACK_DEPTH];
> > > > > > };
> > > >
> > > > > Oh, and Paul suggested using u16s right after I sent it out. So
> > > > > I'll either send an update or send an incremental in case you
> > > > > already applied it.
> > >
> > > yes, that's probably a good idea. Although u8 might be even better -
> > > do we ever want to do more than 256 deep stack vectors? Even those
> > > would take quite some time to construct and pass down.
> >
> > We'd have to pad it with 4 more bytes to remain u64 aligned,
>
> ok, indeed.
>
> > [...] also, why restrict ourselves. That MAX_STACK_DEPTH limit is
> > > trivially fixable if indeed someone finds it insufficient.
>
> well .. think about it: walking more than 256 stack frames for every
> IRQ event? Getting backtraces like:
>
> <func_0+0x123>
...
> <func_269+0x123>
>
> does that make much sense _per event_? How do you visualize it?
You can use it to calculate aggregate times. Eg. attribute the time
spend in func_0 to func_1 to func_2 etc. And use a tree view based on
these call-chains, allowing you to drill-down -- which is basically what
the sysprof GUI does.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:28 ` Ingo Molnar
2009-04-02 18:38 ` Jaswinder Singh Rajput
@ 2009-04-02 18:51 ` Jaswinder Singh Rajput
1 sibling, 0 replies; 58+ messages in thread
From: Jaswinder Singh Rajput @ 2009-04-02 18:51 UTC (permalink / raw)
To: Ingo Molnar
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
On Thu, 2009-04-02 at 20:28 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
>
> > On Thu, 2009-04-02 at 20:11 +0200, Ingo Molnar wrote:
> > > * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> > >
> > > > On Thu, 2009-04-02 at 19:05 +0530, Jaswinder Singh Rajput wrote:
> > > > > On Thu, 2009-04-02 at 12:03 +0000, Peter Zijlstra wrote:
> > > > > > Commit-ID: 373b153677f26a263ef297d77a5e045a31f6486c
> > > > > > Gitweb: http://git.kernel.org/tip/373b153677f26a263ef297d77a5e045a31f6486c
> > > > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > > > AuthorDate: Thu, 2 Apr 2009 11:12:02 +0200
> > > > > > Committer: Ingo Molnar <mingo@elte.hu>
> > > > > > CommitDate: Thu, 2 Apr 2009 13:53:00 +0200
> > > > > >
> > > > > > perf_counter: kerneltop: update to new ABI
> > > > > >
> > > > > > Update to reflect the new record_type ABI changes.
> > > > > >
> > > > >
> > > > > perfstat is still having many issues:
> > > > >
> > > > > 1. 0:6: bus-cycles is not valid for AMD, so it fails
> > > > >
This is the major issue.
bus-cycles is not valid for AMD and it returns an error. And I am not
able to find any relevant event for AMD.
How can we suppress this error on an AMD box?
--
JSR
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 18:42 ` Peter Zijlstra
@ 2009-04-02 19:19 ` Ingo Molnar
0 siblings, 0 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 19:19 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2009-04-02 at 20:34 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> > > On Thu, 2009-04-02 at 20:18 +0200, Ingo Molnar wrote:
> > > > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > >
> > > > > On Thu, 2009-04-02 at 13:36 +0200, Ingo Molnar wrote:
> > > > >
> > > > > > > -#define MAX_STACK_DEPTH 255
> > > > > > > +#define MAX_STACK_DEPTH 254
> > > > > > >
> > > > > > > struct perf_callchain_entry {
> > > > > > > - u64 nr;
> > > > > > > + u32 nr, hv, kernel, user;
> > > > > > > u64 ip[MAX_STACK_DEPTH];
> > > > > > > };
> > > > >
> > > > > Oh, and Paul suggested using u16s right after I send it out. So
> > > > > I'll either send an update or send a incremental in case you
> > > > > already applied it.
> > > >
> > > > yes, that's probably a good idea. Although u8 might be even better -
> > > > do we ever want to do more than 256 deep stack vectors? Even those
> > > > would take quite some time to construct and pass down.
> > >
> > > We'd have to pad it with 4 more bytes to remain u64 aligned,
> >
> > ok, indeed.
> >
> > > [...] also, why restrict ourselves. That MAX_STACK_DEPTH limit is
> > > trivially fixable if indeed someone finds its insufficient.
> >
> > well .. think about it: walking more than 256 stack frames for every
> > IRQ event? Getting backtraces like:
> >
> > <func_0+0x123>
> ...
> > <func_269+0x123>
> >
> > does that make much sense _per event_? How do you visualize it?
>
> You can use it to calculate aggregate times. Eg. attribute the
> time spend in func_0 to func_1 to func_2 etc. And use a tree view
> based on these call-chains, allowing you to drill-down -- which is
> basically what the sysprof GUI does.
yeah - but at a depth of more than 256?
(and who'd ever use more than 640K RAM anyway ;-)
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [tip:perfcounters/core] perf_counter: kerneltop: update to new ABI
2009-04-02 18:38 ` Jaswinder Singh Rajput
@ 2009-04-02 19:20 ` Ingo Molnar
0 siblings, 0 replies; 58+ messages in thread
From: Ingo Molnar @ 2009-04-02 19:20 UTC (permalink / raw)
To: Jaswinder Singh Rajput
Cc: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx, cjashfor,
linux-tip-commits
* Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> On Thu, 2009-04-02 at 20:28 +0200, Ingo Molnar wrote:
>
> > That is not what i suggested though. I suggested '-e 1' to mean to
> > default to the more common case: hw counters. I.e.:
> >
> > -e 1 == -e 0:1
> > -e 2 == -e 0:2
> > -e 3 == -e 0:3
> > ...
> >
> > sw counters could be specified only via the longer form.
> >
>
> If someone want to see all supported event counters. Then what is
> your suggestion, can we represent some shortcut for it.
the help text already displays something like that although it is
not dynamic.
Ingo
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 2/6] RFC perf_counter: singleshot support
2009-04-02 12:26 ` Ingo Molnar
@ 2009-04-02 21:23 ` Paul Mackerras
0 siblings, 0 replies; 58+ messages in thread
From: Paul Mackerras @ 2009-04-02 21:23 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Peter Zijlstra, Corey Ashford, linux-kernel
Ingo Molnar writes:
> Sure. One sidenote - the precision of sw clocks has dropped a bit
> lately:
>
> aldebaran:~/linux/linux/Documentation/perf_counter> ./perfstat -e
> 1:0 -e 1:0 -e 1:0 -e 1:0 -e 1:0 sleep 1
>
> Performance counter stats for 'sleep':
>
> 0.762664 cpu clock ticks (msecs)
> 0.761440 cpu clock ticks (msecs)
> 0.760977 cpu clock ticks (msecs)
> 0.760587 cpu clock ticks (msecs)
> 0.760287 cpu clock ticks (msecs)
>
> Wall-clock time elapsed: 1003.139373 msecs
>
> See that slight but noticeable skew? This used to work fine and we
> had the exact same value everywhere. Can we fix that while still
> keeping the code nice?
I suggest basing the software clock on counter->ctx->time_now, and
make get_context_time use cpu_clock() always. That way we will only
call cpu_clock() once even if we have multiple cpu clock counters, and
that will eliminate the skew as well as being more efficient.
Paul.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
` (2 preceding siblings ...)
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
@ 2009-04-02 22:33 ` Corey Ashford
2009-04-02 23:27 ` Corey Ashford
3 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-02 22:33 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> Per suggestion from Paul, move the event overflow bits to record_type
> and sanitize the enums a bit.
>
> Breaks the ABI -- again ;-)
>
[snip]
With this patch, the definitions look like this now:
[snip]
/*
* Bits that can be set in hw_event.record_type to request information
* in the overflow packets.
*/
enum perf_counter_record_format {
PERF_RECORD_IP = 1U << 0,
PERF_RECORD_TID = 1U << 1,
PERF_RECORD_GROUP = 1U << 2,
PERF_RECORD_CALLCHAIN = 1U << 3,
};
[snip]
enum perf_event_type {
PERF_EVENT_MMAP = 1,
PERF_EVENT_MUNMAP = 2,
/*
* Half the event type space is reserved for the counter overflow
* bitfields, as found in hw_event.record_type.
*
* These events will have types of the form:
* PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
*/
PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
__PERF_EVENT_IP = PERF_RECORD_IP,
__PERF_EVENT_TID = PERF_RECORD_TID,
__PERF_EVENT_GROUP = PERF_RECORD_GROUP,
__PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN,
};
[snip]
Unless I'm misreading something here, there's overlap in the enum values
of perf_event_type enum. PERF_EVENT_MMAP has the same value as
__PERF_EVENT_IP, and PERF_EVENT_MUNMAP has the same value as
__PERF_EVENT_TID.
Are these lower bits being reused when PERF_EVENT_COUNTER_OVERFLOW is
OR'd in, which would imply that PERF_EVENT_MMAP and PERF_EVENT_MUNMAP
are mutually exclusive with all of the PERF_EVENT_COUNTER_OVERFLOW values.
Actually, I don't really understand the purpose of the PERF_EVENT_MMAP
and PERF_EVENT_MUNMAP bits. My hazy understanding is that they are used
for finding the file, function and line number at overflow interrupt
time, but it's unclear to me what that has to do with mmap. I'll go
back and try to find the relevant patch notes again.
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 22:33 ` [PATCH 1/6] " Corey Ashford
@ 2009-04-02 23:27 ` Corey Ashford
2009-04-03 6:50 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-02 23:27 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Whoops, nevermind.
My misunderstanding on this one. This enum is used for event type, not
the record_type, and as such it makes sense for there to be exclusive
mmap and munmap event records.
Thinking about this a bit, I'm guessing that the idea is to track the
loading and unloading of shared objects which use mmap and munmap, so
that the ip can be related to a particular object that was mapped
in at the time of the counter overflow interrupt. Is that right?
- Corey
Corey Ashford wrote:
> Peter Zijlstra wrote:
>> Per suggestion from Paul, move the event overflow bits to record_type
>> and sanitize the enums a bit.
>>
>> Breaks the ABI -- again ;-)
>>
> [snip]
>
> With this patch, the definitions look like this now:
>
> [snip]
> /*
> * Bits that can be set in hw_event.record_type to request information
> * in the overflow packets.
> */
> enum perf_counter_record_format {
> PERF_RECORD_IP = 1U << 0,
> PERF_RECORD_TID = 1U << 1,
> PERF_RECORD_GROUP = 1U << 2,
> PERF_RECORD_CALLCHAIN = 1U << 3,
> };
>
> [snip]
> enum perf_event_type {
>
> PERF_EVENT_MMAP = 1,
> PERF_EVENT_MUNMAP = 2,
>
> /*
> * Half the event type space is reserved for the counter overflow
> * bitfields, as found in hw_event.record_type.
> *
> * These events will have types of the form:
> * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
> */
> PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
> __PERF_EVENT_IP = PERF_RECORD_IP,
> __PERF_EVENT_TID = PERF_RECORD_TID,
> __PERF_EVENT_GROUP = PERF_RECORD_GROUP,
> __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN,
> };
> [snip]
>
>
> Unless I'm misreading something here, there's overlap in the enum values
> of perf_event_type enum. PERF_EVENT_MMAP has the same value as
> __PERF_EVENT_IP, and PERF_EVENT_MUNMAP has the same value as
> __PERF_EVENT_TID.
>
> Are these lower bits being reused when PERF_EVENT_COUNTER_OVERFLOW is
> OR'd in, which would imply that PERF_EVENT_MMAP and PERF_EVENT_MUNMAP
> are mutually exclusive with all of the PERF_EVENT_COUNTER_OVERFLOW values.
>
> Actually, I don't really understand the purpose of the PERF_EVENT_MMAP
> and PERF_EVENT_MUNMAP bits. My hazy understanding is that they are used
> for finding the file, function and line number at overflow interrupt
> time, but it's unclear to me what that has to do with mmap. I'll go
> back and try to find the relevant patch notes again.
>
> - Corey
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Regards,
- Corey
Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
Beaverton, OR
503-578-3507
cjashfor@us.ibm.com
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-02 23:27 ` Corey Ashford
@ 2009-04-03 6:50 ` Peter Zijlstra
2009-04-03 7:30 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-03 6:50 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Thu, 2009-04-02 at 16:27 -0700, Corey Ashford wrote:
> Whoops, nevermind.
>
> My misunderstanding on this one. This enum is used for event type, not
> the record_type, and as such is makes sense for there to be exclusive
> mmap and munmap event records.
>
> Thinking about this a bit, I'm guessing that the idea is to track the
> loading and unloading of shared objects which uses mmap and munmap, so
> that that the ip can be related to a particular object that was mapped
> in at the time of the counter overflow interrupt. Is that right?
Indeed, whenever a mmap/munmap happens of a PROT_EXEC range we record
that information so that we can relate the userspace IPs to some file
and hence the actual userspace code.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 1/6] perf_counter: move the event overflow output bits to record_type
2009-04-03 6:50 ` Peter Zijlstra
@ 2009-04-03 7:30 ` Corey Ashford
0 siblings, 0 replies; 58+ messages in thread
From: Corey Ashford @ 2009-04-03 7:30 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Thu, 2009-04-02 at 16:27 -0700, Corey Ashford wrote:
>> Whoops, nevermind.
>>
>> My misunderstanding on this one. This enum is used for event type, not
>> the record_type, and as such is makes sense for there to be exclusive
>> mmap and munmap event records.
>>
>> Thinking about this a bit, I'm guessing that the idea is to track the
>> loading and unloading of shared objects which uses mmap and munmap, so
>> that that the ip can be related to a particular object that was mapped
>> in at the time of the counter overflow interrupt. Is that right?
>
> Indeed, whenever a mmap/munmap happens of a PROT_EXEC range we record
> that information so that we can relate the userspace IPs to some file
> and hence the actual userspace code.
Ah, I see, PROT_EXEC identifies the executable code sections in particular.
Thanks :-)
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
2009-04-02 11:36 ` Ingo Molnar
2009-04-02 12:04 ` [tip:perfcounters/core] " Peter Zijlstra
@ 2009-04-03 12:50 ` Peter Zijlstra
2009-04-03 18:25 ` Corey Ashford
2 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-03 12:50 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Paul Mackerras, Corey Ashford, linux-kernel
On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
> plain text document attachment (perf_counter_callchain_context.patch)
> Put in counts to tell which ips belong to what context.
>
> -----
> | | hv
> | --
> nr | | kernel
> | --
> | | user
> -----
Right, just realized that PERF_RECORD_IP needs something similar if one
is not able to derive the context from the IP itself.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-03 12:50 ` [PATCH 5/6] " Peter Zijlstra
@ 2009-04-03 18:25 ` Corey Ashford
2009-04-06 11:01 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-03 18:25 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
>> plain text document attachment (perf_counter_callchain_context.patch)
>> Put in counts to tell which ips belong to what context.
>>
>> -----
>> | | hv
>> | --
>> nr | | kernel
>> | --
>> | | user
>> -----
>
> Right, just realized that PERF_RECORD_IP needs something similar if one
> if not able to derive the context from the IP itself..
>
Three individual bits would suffice, or you could use a two-bit code -
00 = user
01 = kernel
10 = hypervisor
11 = reserved (or perhaps unknown)
Unfortunately, because of alignment, it would need to take up another 64
bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
a machine independent way.
And since you probably need a separate word, that effectively doubles
the amount of space taken up by IP samples (if we add a "no event
header" option). Should we add another bit in the record_type field -
PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
get this if they don't need it?
Regards,
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-03 18:25 ` Corey Ashford
@ 2009-04-06 11:01 ` Peter Zijlstra
2009-04-06 11:07 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-06 11:01 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Fri, 2009-04-03 at 11:25 -0700, Corey Ashford wrote:
> Peter Zijlstra wrote:
> > On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
> >> plain text document attachment (perf_counter_callchain_context.patch)
> >> Put in counts to tell which ips belong to what context.
> >>
> >> -----
> >> | | hv
> >> | --
> >> nr | | kernel
> >> | --
> >> | | user
> >> -----
> >
> > Right, just realized that PERF_RECORD_IP needs something similar if one
> > if not able to derive the context from the IP itself..
> >
> Three individual bits would suffice, or you could use a two-bit code -
> 00 = user
> 01 = kernel
> 10 = hypervisor
> 11 = reserved (or perhaps unknown)
>
> Unfortunately, because of alignment, it would need to take up another 64
> bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
> a machine independent way.
>
> And since you probably need a separate word, that effectively doubles
> the amount of space taken up by IP samples (if we add a "no event
> header" option). Should we add another bit in the record_type field -
> PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
> get this if they don't need it?
If we limit the event size to 64k (surely enough, right? :-), then we
have 16 more bits to play with in the header, and we could do something
like the below.
A further possibility would also be to add an overflow bit in there,
making the full 32bit PERF_RECORD space available to output events as
well.
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -201,9 +201,17 @@ struct perf_counter_mmap_page {
__u32 data_head; /* head in the data section */
};
+enum {
+ PERF_EVENT_LEVEL_HV = 0,
+ PERF_EVENT_LEVEL_KERNEL = 1,
+ PERF_EVENT_LEVEL_USER = 2,
+};
+
struct perf_event_header {
__u32 type;
- __u32 size;
+ __u16 level : 2,
+ __reserved : 14;
+ __u16 size;
};
enum perf_event_type {
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1832,6 +1832,8 @@ static void perf_counter_output(struct p
header.type = PERF_EVENT_COUNTER_OVERFLOW;
header.size = sizeof(header);
+ header.level = user_mode(regs) ?
+ PERF_EVENT_LEVEL_USER : PERF_EVENT_LEVEL_KERNEL;
if (record_type & PERF_RECORD_IP) {
ip = instruction_pointer(regs);
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 11:01 ` Peter Zijlstra
@ 2009-04-06 11:07 ` Peter Zijlstra
2009-04-06 18:53 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-06 11:07 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Mon, 2009-04-06 at 13:01 +0200, Peter Zijlstra wrote:
> On Fri, 2009-04-03 at 11:25 -0700, Corey Ashford wrote:
> > Peter Zijlstra wrote:
> > > On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
> > >> plain text document attachment (perf_counter_callchain_context.patch)
> > >> Put in counts to tell which ips belong to what context.
> > >>
> > >> -----
> > >> | | hv
> > >> | --
> > >> nr | | kernel
> > >> | --
> > >> | | user
> > >> -----
> > >
> > > Right, just realized that PERF_RECORD_IP needs something similar if one
> > > if not able to derive the context from the IP itself..
> > >
> > Three individual bits would suffice, or you could use a two-bit code -
> > 00 = user
> > 01 = kernel
> > 10 = hypervisor
> > 11 = reserved (or perhaps unknown)
> >
> > Unfortunately, because of alignment, it would need to take up another 64
> > bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
> > a machine independent way.
> >
> > And since you probably need a separate word, that effectively doubles
> > the amount of space taken up by IP samples (if we add a "no event
> > header" option). Should we add another bit in the record_type field -
> > PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
> > get this if they don't need it?
>
> If we limit the event size to 64k (surely enough, right? :-), then we
> have 16 more bits to play with in the header, and we could do something
> like the below.
>
> A further possibility would also be to add an overflow bit in there,
> making the full 32bit PERF_RECORD space available to output events as
> well.
>
> Index: linux-2.6/include/linux/perf_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_counter.h
> +++ linux-2.6/include/linux/perf_counter.h
> @@ -201,9 +201,17 @@ struct perf_counter_mmap_page {
> __u32 data_head; /* head in the data section */
> };
>
> +enum {
> + PERF_EVENT_LEVEL_HV = 0,
> + PERF_EVENT_LEVEL_KERNEL = 1,
> + PERF_EVENT_LEVEL_USER = 2,
> +};
> +
> struct perf_event_header {
> __u32 type;
> - __u32 size;
> + __u16 level : 2,
> + __reserved : 14;
> + __u16 size;
> };
Except we should probably use masks again instead of bitfields so that
the thing is portable when streamed to disk, such as would be common
with splice().
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 11:07 ` Peter Zijlstra
@ 2009-04-06 18:53 ` Corey Ashford
2009-04-06 19:06 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-06 18:53 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-04-06 at 13:01 +0200, Peter Zijlstra wrote:
>> On Fri, 2009-04-03 at 11:25 -0700, Corey Ashford wrote:
>>> Peter Zijlstra wrote:
>>>> On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
>>>>> plain text document attachment (perf_counter_callchain_context.patch)
>>>>> Put in counts to tell which ips belong to what context.
>>>>>
>>>>> -----
>>>>> | | hv
>>>>> | --
>>>>> nr | | kernel
>>>>> | --
>>>>> | | user
>>>>> -----
>>>> Right, just realized that PERF_RECORD_IP needs something similar if one
>>>> if not able to derive the context from the IP itself..
>>>>
>>> Three individual bits would suffice, or you could use a two-bit code -
>>> 00 = user
>>> 01 = kernel
>>> 10 = hypervisor
>>> 11 = reserved (or perhaps unknown)
>>>
>>> Unfortunately, because of alignment, it would need to take up another 64
>>> bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
>>> a machine independent way.
>>>
>>> And since you probably need a separate word, that effectively doubles
>>> the amount of space taken up by IP samples (if we add a "no event
>>> header" option). Should we add another bit in the record_type field -
>>> PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
>>> get this if they don't need it?
>> If we limit the event size to 64k (surely enough, right? :-), then we
>> have 16 more bits to play with in the header, and we could do something
>> like the below.
>>
>> A further possibility would also be to add an overflow bit in there,
>> making the full 32bit PERF_RECORD space available to output events as
>> well.
>>
>> Index: linux-2.6/include/linux/perf_counter.h
>> ===================================================================
>> --- linux-2.6.orig/include/linux/perf_counter.h
>> +++ linux-2.6/include/linux/perf_counter.h
>> @@ -201,9 +201,17 @@ struct perf_counter_mmap_page {
>> __u32 data_head; /* head in the data section */
>> };
>>
>> +enum {
>> + PERF_EVENT_LEVEL_HV = 0,
>> + PERF_EVENT_LEVEL_KERNEL = 1,
>> + PERF_EVENT_LEVEL_USER = 2,
>> +};
>> +
>> struct perf_event_header {
>> __u32 type;
>> - __u32 size;
>> + __u16 level : 2,
>> + __reserved : 14;
>> + __u16 size;
>> };
>
> Except we should probably use masks again instead of bitfields so that
> the thing is portable when streamed to disk, such as would be common
> with splice().
One downside of this approach is that if you specify "no header"
(currently not possible, but maybe later?), you will not be able to get
the level bits.
How about adding an optional, 64-bit "miscellaneous" word to the event
record which could contain a number of small bit fields, any or all of
which could be enabled with a PERF_RECORD_* bit. If one or more of the
miscellaneous PERF_RECORD_* bits are set to enable, this assembled word
would be added to the record. So the space cost of the level field goes
down as we add more small fields that need to be recorded.
Something like:
PERF_RECORD_LEVEL = 1U << 4,
PERF_RECORD_INTR_DEPTH = 1U << 5,
PERF_RECORD_STUFF = 1U << 6,
...
#define __PERF_MISC_MASK(name) \
(((1ULL << PERF_MISC_##name##_BITS) - 1) << \
PERF_MISC_##name##_SHIFT)
#define PERF_MISC_LEVEL_BITS 2
#define PERF_MISC_LEVEL_SHIFT 0
#define PERF_MISC_LEVEL_MASK __PERF_MISC_MASK(LEVEL)
#define PERF_MISC_INTR_DEPTH_BITS 8
#define PERF_MISC_INTR_DEPTH_SHIFT 2
#define PERF_MISC_INTR_DEPTH_MASK __PERF_MISC_MASK(INTR_DEPTH)
etc.
Regards,
- Corey
Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
Beaverton, OR
503-578-3507
cjashfor@us.ibm.com
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 18:53 ` Corey Ashford
@ 2009-04-06 19:06 ` Peter Zijlstra
2009-04-06 20:16 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-06 19:06 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Mon, 2009-04-06 at 11:53 -0700, Corey Ashford wrote:
>
> Peter Zijlstra wrote:
> > On Mon, 2009-04-06 at 13:01 +0200, Peter Zijlstra wrote:
> >> On Fri, 2009-04-03 at 11:25 -0700, Corey Ashford wrote:
> >>> Peter Zijlstra wrote:
> >>>> On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
> >>>>> plain text document attachment (perf_counter_callchain_context.patch)
> >>>>> Put in counts to tell which ips belong to what context.
> >>>>>
> >>>>> -----
> >>>>> | | hv
> >>>>> | --
> >>>>> nr | | kernel
> >>>>> | --
> >>>>> | | user
> >>>>> -----
> >>>> Right, just realized that PERF_RECORD_IP needs something similar if one
> >>>> if not able to derive the context from the IP itself..
> >>>>
> >>> Three individual bits would suffice, or you could use a two-bit code -
> >>> 00 = user
> >>> 01 = kernel
> >>> 10 = hypervisor
> >>> 11 = reserved (or perhaps unknown)
> >>>
> >>> Unfortunately, because of alignment, it would need to take up another 64
> >>> bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
> >>> a machine independent way.
> >>>
> >>> And since you probably need a separate word, that effectively doubles
> >>> the amount of space taken up by IP samples (if we add a "no event
> >>> header" option). Should we add another bit in the record_type field -
> >>> PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
> >>> get this if they don't need it?
> >> If we limit the event size to 64k (surely enough, right? :-), then we
> >> have 16 more bits to play with in the header, and we could do something
> >> like the below.
> >>
> >> A further possibility would also be to add an overflow bit in there,
> >> making the full 32bit PERF_RECORD space available to output events as
> >> well.
> >>
> >> Index: linux-2.6/include/linux/perf_counter.h
> >> ===================================================================
> >> --- linux-2.6.orig/include/linux/perf_counter.h
> >> +++ linux-2.6/include/linux/perf_counter.h
> >> @@ -201,9 +201,17 @@ struct perf_counter_mmap_page {
> >> __u32 data_head; /* head in the data section */
> >> };
> >>
> >> +enum {
> >> + PERF_EVENT_LEVEL_HV = 0,
> >> + PERF_EVENT_LEVEL_KERNEL = 1,
> >> + PERF_EVENT_LEVEL_USER = 2,
> >> +};
> >> +
> >> struct perf_event_header {
> >> __u32 type;
> >> - __u32 size;
> >> + __u16 level : 2,
> >> + __reserved : 14;
> >> + __u16 size;
> >> };
> >
> > Except we should probably use masks again instead of bitfields so that
> > the thing is portable when streamed to disk, such as would be common
> > with splice().
>
> One downside of this approach is that you if you specify "no header"
> (currently not possible, but maybe later?), you will not be able to get
> the level bits.
Would this be desirable? I know we've mentioned it before, but it would
mean one cannot mix various event types (currently that means !mmap and
callchain with difficulty).
As long as we mandate this header, we can have 16 misc bits.
> How about adding an optional, 64-bit "miscellaneous" word to the event
> record which could contain a number of small bit fields, any or all of
> which could be enabled with a PERF_RECORD_* bit. If one or more of the
> miscellaneous PERF_RECORD_* bits are set to enable, this assembled word
> would be added to the record. So the space cost of the level field goes
> down as we add more small fields that need to be recorded.
>
> Something like:
>
> PERF_RECORD_LEVEL = 1U << 4,
> PERF_RECORD_INTR_DEPTH = 1U << 5,
> PERF_RECORD_STUFF = 1U << 6,
> ...
>
> #define __PERF_MISC_MASK(name) \
> (((1ULL << PERF_MISC_##name##_BITS) - 1) << \
> PERF_MISC_##name##_SHIFT)
>
> #define PERF_MISC_LEVEL_BITS 2
> #define PERF_MISC_LEVEL_SHIFT 0
> #define PERF_MISC_LEVEL_MASK __PERF_MISC_MASK(LEVEL)
>
> #define PERF_MISC_INTR_DEPTH_BITS 8
> #define PERF_MISC_INTR_DEPTH_SHIFT 2
> #define PERF_MISC_INTR_DEPTH_MASK __PERF_MISC_MASK(INTR_DEPTH)
Yeah, that's the alternative.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 19:06 ` Peter Zijlstra
@ 2009-04-06 20:16 ` Corey Ashford
2009-04-06 20:46 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-06 20:16 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-04-06 at 11:53 -0700, Corey Ashford wrote:
>> Peter Zijlstra wrote:
>>> On Mon, 2009-04-06 at 13:01 +0200, Peter Zijlstra wrote:
>>>> On Fri, 2009-04-03 at 11:25 -0700, Corey Ashford wrote:
>>>>> Peter Zijlstra wrote:
>>>>>> On Thu, 2009-04-02 at 11:12 +0200, Peter Zijlstra wrote:
>>>>>>> plain text document attachment (perf_counter_callchain_context.patch)
>>>>>>> Put in counts to tell which ips belong to what context.
>>>>>>>
>>>>>>> -----
>>>>>>> | | hv
>>>>>>> | --
>>>>>>> nr | | kernel
>>>>>>> | --
>>>>>>> | | user
>>>>>>> -----
>>>>>> Right, just realized that PERF_RECORD_IP needs something similar if one
> >>>>>> is not able to derive the context from the IP itself..
>>>>>>
>>>>> Three individual bits would suffice, or you could use a two-bit code -
>>>>> 00 = user
>>>>> 01 = kernel
>>>>> 10 = hypervisor
>>>>> 11 = reserved (or perhaps unknown)
>>>>>
>>>>> Unfortunately, because of alignment, it would need to take up another 64
>>>>> bit word, wouldn't it? Too bad you cannot sneak the bits into the IP in
>>>>> a machine independent way.
>>>>>
>>>>> And since you probably need a separate word, that effectively doubles
>>>>> the amount of space taken up by IP samples (if we add a "no event
>>>>> header" option). Should we add another bit in the record_type field -
>>>>> PERF_RECORD_IP_LEVEL (or similar) so that user-space apps don't have to
>>>>> get this if they don't need it?
>>>> If we limit the event size to 64k (surely enough, right? :-), then we
>>>> have 16 more bits to play with in the header, and we could do something
>>>> like the below.
>>>>
>>>> A further possibility would also be to add an overflow bit in there,
>>>> making the full 32bit PERF_RECORD space available to output events as
>>>> well.
>>>>
>>>> Index: linux-2.6/include/linux/perf_counter.h
>>>> ===================================================================
>>>> --- linux-2.6.orig/include/linux/perf_counter.h
>>>> +++ linux-2.6/include/linux/perf_counter.h
>>>> @@ -201,9 +201,17 @@ struct perf_counter_mmap_page {
>>>> __u32 data_head; /* head in the data section */
>>>> };
>>>>
>>>> +enum {
>>>> + PERF_EVENT_LEVEL_HV = 0,
>>>> + PERF_EVENT_LEVEL_KERNEL = 1,
>>>> + PERF_EVENT_LEVEL_USER = 2,
>>>> +};
>>>> +
>>>> struct perf_event_header {
>>>> __u32 type;
>>>> - __u32 size;
>>>> + __u16 level : 2,
>>>> + __reserved : 14;
>>>> + __u16 size;
>>>> };
>>> Except we should probably use masks again instead of bitfields so that
>>> the thing is portable when streamed to disk, such as would be common
>>> with splice().
>> One downside of this approach is that if you specify "no header"
>> (currently not possible, but maybe later?), you will not be able to get
>> the level bits.
>
> Would this be desirable? I know we've mentioned it before, but it would
> mean one cannot mix various event types (currently that means !mmap and
> callchain with difficulty).
I think it would. For one use case I'm working on right now, simple
profiling, all I need are ip's. If I could omit the header, that would
reduce the frequency of sigio's by a factor of three, and make it faster
to read up the ip's when the SIGIO's occur.
I realize that it makes it impossible to mix record types with the
header removed, and skipping over the call chain data a bit more
difficult (but not rocket science).
It could be made an error for the caller to specify both "no header" and
perf_counter_hw_event.mmap|munmap
>
> As long as we mandate this header, we can have 16 misc bits.
>
True.
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 20:16 ` Corey Ashford
@ 2009-04-06 20:46 ` Peter Zijlstra
2009-04-06 21:15 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-06 20:46 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
> >> One downside of this approach is that if you specify "no header"
> >> (currently not possible, but maybe later?), you will not be able to get
> >> the level bits.
> >
> > Would this be desirable?
> I think it would. For one use case I'm working on right now, simple
> profiling, all I need are ip's. If I could omit the header, that would
> reduce the frequency of sigio's by a factor of three, and make it faster
> to read up the ip's when the SIGIO's occur.
Self-profiling?
So you're interested in getting the smallest possible record size, that
would still be 2 u64, right? Otherwise you don't get the IP context that
started this.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 20:46 ` Peter Zijlstra
@ 2009-04-06 21:15 ` Corey Ashford
2009-04-06 21:21 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-06 21:15 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
>
>>>> One downside of this approach is that if you specify "no header"
>>>> (currently not possible, but maybe later?), you will not be able to get
>>>> the level bits.
>>> Would this be desirable?
>
>> I think it would. For one use case I'm working on right now, simple
>> profiling, all I need are ip's. If I could omit the header, that would
>> reduce the frequency of sigio's by a factor of three, and make it faster
>> to read up the ip's when the SIGIO's occur.
>
> Self-profiling?
>
> So you're interested in getting the smallest possible record size, that
> would still be 2 u64, right? Otherwise you don't get the IP context that
> started this.
>
>
Self-profiling mainly, yes. PAPI specs an ability for remote monitoring
of processes and threads, but I think it's only partially implemented.
So when you are talking about IP context, you mean pid/tid?
Regards,
- Corey
Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
Beaverton, OR
503-578-3507
cjashfor@us.ibm.com
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 21:15 ` Corey Ashford
@ 2009-04-06 21:21 ` Peter Zijlstra
2009-04-06 21:33 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-06 21:21 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Mon, 2009-04-06 at 14:15 -0700, Corey Ashford wrote:
>
> Peter Zijlstra wrote:
> > On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
> >
> >>>> One downside of this approach is that if you specify "no header"
> >>>> (currently not possible, but maybe later?), you will not be able to get
> >>>> the level bits.
> >>> Would this be desirable?
> >
> >> I think it would. For one use case I'm working on right now, simple
> >> profiling, all I need are ip's. If I could omit the header, that would
> >> reduce the frequency of sigio's by a factor of three, and make it faster
> >> to read up the ip's when the SIGIO's occur.
> >
> > Self-profiling?
> >
> > So you're interested in getting the smallest possible record size, that
> > would still be 2 u64, right? Otherwise you don't get the IP context that
> > started this.
> >
> >
>
> Self-profiling mainly, yes. PAPI specs an ability for remote monitoring
> of processes and threads, but I think it's only partially implemented.
>
> So when you are talking about IP context, you mean pid/tid?
Ah, we called it level before, the hv/kernel/user thing. For remote
profiling you'd want to have the mmap thing too.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 21:21 ` Peter Zijlstra
@ 2009-04-06 21:33 ` Corey Ashford
2009-04-07 7:11 ` Peter Zijlstra
0 siblings, 1 reply; 58+ messages in thread
From: Corey Ashford @ 2009-04-06 21:33 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-04-06 at 14:15 -0700, Corey Ashford wrote:
>> Peter Zijlstra wrote:
>>> On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
>>>
>>>>>> One downside of this approach is that if you specify "no header"
>>>>>> (currently not possible, but maybe later?), you will not be able to get
>>>>>> the level bits.
>>>>> Would this be desirable?
>>>> I think it would. For one use case I'm working on right now, simple
>>>> profiling, all I need are ip's. If I could omit the header, that would
>>>> reduce the frequency of sigio's by a factor of three, and make it faster
>>>> to read up the ip's when the SIGIO's occur.
>>> Self-profiling?
>>>
>>> So you're interested in getting the smallest possible record size, that
>>> would still be 2 u64, right? Otherwise you don't get the IP context that
>>> started this.
>>>
>>>
>> Self-profiling mainly, yes. PAPI specs an ability for remote monitoring
>> of processes and threads, but I think it's only partially implemented.
>>
>> So when you are talking about IP context, you mean pid/tid?
>
> Ah, we called it level before, the hv/kernel/user thing. For remote
> profiling you'd want to have the mmap thing too.
Oh I see. In PAPI, the user specifies the range(s) of addresses he's
interested in profiling (any sampled IP's outside the requested ranges
are discarded), and so as long as the kernel space IP's don't overlap
with user space IP's, we should be fine.
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-06 21:33 ` Corey Ashford
@ 2009-04-07 7:11 ` Peter Zijlstra
2009-04-07 16:27 ` Corey Ashford
0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2009-04-07 7:11 UTC (permalink / raw)
To: Corey Ashford; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
On Mon, 2009-04-06 at 14:33 -0700, Corey Ashford wrote:
>
> Peter Zijlstra wrote:
> > On Mon, 2009-04-06 at 14:15 -0700, Corey Ashford wrote:
> >> Peter Zijlstra wrote:
> >>> On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
> >>>
> >>>>>> One downside of this approach is that if you specify "no header"
> >>>>>> (currently not possible, but maybe later?), you will not be able to get
> >>>>>> the level bits.
> >>>>> Would this be desirable?
> >>>> I think it would. For one use case I'm working on right now, simple
> >>>> profiling, all I need are ip's. If I could omit the header, that would
> >>>> reduce the frequency of sigio's by a factor of three, and make it faster
> >>>> to read up the ip's when the SIGIO's occur.
> >>> Self-profiling?
> >>>
> >>> So you're interested in getting the smallest possible record size, that
> >>> would still be 2 u64, right? Otherwise you don't get the IP context that
> >>> started this.
> >>>
> >>>
> >> Self-profiling mainly, yes. PAPI specs an ability for remote monitoring
> >> of processes and threads, but I think it's only partially implemented.
> >>
> >> So when you are talking about IP context, you mean pid/tid?
> >
> > Ah, we called it level before, the hv/kernel/user thing. For remote
> > profiling you'd want to have the mmap thing too.
>
> Oh I see. In PAPI, the user specifies the range(s) of addresses he's
> interested in profiling (any sampled IP's outside the requested ranges
> are discarded), and so as long as the kernel space IP's don't overlap
> with user space IP's, we should be fine.
Ah, while this would be true for most 'sane' architectures, Paul was
right in pointing out that this is not true for all architectures -- and
we should therefore not rely on address range alone.
You could of course use: hw_event.exclude_{hv,kernel} = 1 to ensure you
only get userspace thingies I suppose (but then you have no way of
telling how many you missed I guess).
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [PATCH 5/6] perf_counter: add more context information
2009-04-07 7:11 ` Peter Zijlstra
@ 2009-04-07 16:27 ` Corey Ashford
0 siblings, 0 replies; 58+ messages in thread
From: Corey Ashford @ 2009-04-07 16:27 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Paul Mackerras, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-04-06 at 14:33 -0700, Corey Ashford wrote:
>> Peter Zijlstra wrote:
>>> On Mon, 2009-04-06 at 14:15 -0700, Corey Ashford wrote:
>>>> Peter Zijlstra wrote:
>>>>> On Mon, 2009-04-06 at 13:16 -0700, Corey Ashford wrote:
>>>>>
>>>>>>>> One downside of this approach is that if you specify "no header"
>>>>>>>> (currently not possible, but maybe later?), you will not be able to get
>>>>>>>> the level bits.
>>>>>>> Would this be desirable?
>>>>>> I think it would. For one use case I'm working on right now, simple
>>>>>> profiling, all I need are ip's. If I could omit the header, that would
>>>>>> reduce the frequency of sigio's by a factor of three, and make it faster
>>>>>> to read up the ip's when the SIGIO's occur.
>>>>> Self-profiling?
>>>>>
>>>>> So you're interested in getting the smallest possible record size, that
>>>>> would still be 2 u64, right? Otherwise you don't get the IP context that
>>>>> started this.
>>>>>
>>>>>
>>>> Self-profiling mainly, yes. PAPI specs an ability for remote monitoring
>>>> of processes and threads, but I think it's only partially implemented.
>>>>
>>>> So when you are talking about IP context, you mean pid/tid?
>>> Ah, we called it level before, the hv/kernel/user thing. For remote
>>> profiling you'd want to have the mmap thing too.
>> Oh I see. In PAPI, the user specifies the range(s) of addresses he's
>> interested in profiling (any sampled IP's outside the requested ranges
>> are discarded), and so as long as the kernel space IP's don't overlap
>> with user space IP's, we should be fine.
>
> Ah, while this would be true for most 'sane' architectures, Paul was
> right in pointing out that this is not true for all architectures -- and
> we should therefore not rely on address range alone.
>
> You could of course use: hw_event.exclude_{hv,kernel} = 1 to ensure you
> only get userspace thingies I suppose (but then you have no way of
> telling how many you missed I guess).
That's a good point. PAPI's profiling API doesn't have a way for the
caller to distinguish which address spaces (user/kernel/hv) he wants to
profile. It does have a way to designate which levels to ignore, but if
you enable them all, you cannot specify the profiling address ranges
pertaining to each. That may be something I could propose adding to
PAPI. I suspect it would be pretty rarely used, though.
- Corey
^ permalink raw reply [flat|nested] 58+ messages in thread
end of thread, other threads:[~2009-04-07 16:28 UTC | newest]
Thread overview: 58+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-04-02 9:11 [PATCH 0/6] more perf_counter stuff Peter Zijlstra
2009-04-02 9:11 ` [PATCH 1/6] perf_counter: move the event overflow output bits to record_type Peter Zijlstra
2009-04-02 11:28 ` Ingo Molnar
2009-04-02 11:43 ` Ingo Molnar
2009-04-02 11:47 ` Peter Zijlstra
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 22:33 ` [PATCH 1/6] " Corey Ashford
2009-04-02 23:27 ` Corey Ashford
2009-04-03 6:50 ` Peter Zijlstra
2009-04-03 7:30 ` Corey Ashford
2009-04-02 9:12 ` [PATCH 2/6] RFC perf_counter: singleshot support Peter Zijlstra
2009-04-02 10:51 ` Ingo Molnar
2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 12:26 ` Ingo Molnar
2009-04-02 21:23 ` Paul Mackerras
2009-04-02 12:18 ` Peter Zijlstra
2009-04-02 18:10 ` Ingo Molnar
2009-04-02 18:33 ` Peter Zijlstra
2009-04-02 9:12 ` [PATCH 3/6] perf_counter: per event wakeups Peter Zijlstra
2009-04-02 11:32 ` Ingo Molnar
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 9:12 ` [PATCH 4/6] perf_counter: kerneltop: update to new ABI Peter Zijlstra
2009-04-02 12:03 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-02 13:35 ` Jaswinder Singh Rajput
2009-04-02 13:59 ` Jaswinder Singh Rajput
2009-04-02 18:11 ` Ingo Molnar
2009-04-02 18:22 ` Jaswinder Singh Rajput
2009-04-02 18:28 ` Ingo Molnar
2009-04-02 18:38 ` Jaswinder Singh Rajput
2009-04-02 19:20 ` Ingo Molnar
2009-04-02 18:51 ` Jaswinder Singh Rajput
2009-04-02 18:32 ` Jaswinder Singh Rajput
2009-04-02 9:12 ` [PATCH 5/6] perf_counter: add more context information Peter Zijlstra
2009-04-02 11:36 ` Ingo Molnar
2009-04-02 11:46 ` Peter Zijlstra
2009-04-02 18:16 ` Ingo Molnar
2009-04-02 11:48 ` Peter Zijlstra
2009-04-02 18:18 ` Ingo Molnar
2009-04-02 18:29 ` Peter Zijlstra
2009-04-02 18:34 ` Ingo Molnar
2009-04-02 18:42 ` Peter Zijlstra
2009-04-02 19:19 ` Ingo Molnar
2009-04-02 12:04 ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-03 12:50 ` [PATCH 5/6] " Peter Zijlstra
2009-04-03 18:25 ` Corey Ashford
2009-04-06 11:01 ` Peter Zijlstra
2009-04-06 11:07 ` Peter Zijlstra
2009-04-06 18:53 ` Corey Ashford
2009-04-06 19:06 ` Peter Zijlstra
2009-04-06 20:16 ` Corey Ashford
2009-04-06 20:46 ` Peter Zijlstra
2009-04-06 21:15 ` Corey Ashford
2009-04-06 21:21 ` Peter Zijlstra
2009-04-06 21:33 ` Corey Ashford
2009-04-07 7:11 ` Peter Zijlstra
2009-04-07 16:27 ` Corey Ashford
2009-04-02 9:12 ` [PATCH 6/6] perf_counter: update mmap() counter read Peter Zijlstra
2009-04-02 12:04 ` [tip:perfcounters/core] " Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox