linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: tip-bot for Stephane Eranian <eranian@google.com>
To: linux-tip-commits@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, eranian@google.com, hpa@zytor.com,
	mingo@redhat.com, a.p.zijlstra@chello.nl, tglx@linutronix.de,
	mingo@elte.hu
Subject: [tip:perf/hw-branch-sampling] perf: Add callback to flush branch_stack on context switch
Date: Fri, 9 Mar 2012 05:26:49 -0800	[thread overview]
Message-ID: <tip-d010b3326cf06b3406cdd88af16dcf4e4b6fec2e@git.kernel.org> (raw)
In-Reply-To: <1328826068-11713-11-git-send-email-eranian@google.com>

Commit-ID:  d010b3326cf06b3406cdd88af16dcf4e4b6fec2e
Gitweb:     http://git.kernel.org/tip/d010b3326cf06b3406cdd88af16dcf4e4b6fec2e
Author:     Stephane Eranian <eranian@google.com>
AuthorDate: Thu, 9 Feb 2012 23:21:00 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 5 Mar 2012 14:55:42 +0100

perf: Add callback to flush branch_stack on context switch

With branch stack sampling, it is possible to filter by priv levels.

In system-wide mode, that means it is possible to capture only user
level branches. The builtin SW LBR filter needs to disassemble code
based on LBR captured addresses. For that, it needs to know the task
the addresses are associated with. Because of context switches, the
content of the branch stack buffer may contain addresses from
different tasks.

We need a callback on context switch to either flush the branch stack
or save it. This patch adds a new callback in struct pmu which is called
during context switches. The callback is called only when necessary.
That is when a system-wide context has, at least, one event which
uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for
per-thread context.

In this version, the Intel x86 code simply flushes (resets) the LBR
on context switches (fills it with zeroes). Those zeroed branches are
then filtered out by the SW filter.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |   21 +++++---
 arch/x86/kernel/cpu/perf_event.h       |    1 +
 arch/x86/kernel/cpu/perf_event_intel.c |   13 +++++
 include/linux/perf_event.h             |    9 +++-
 kernel/events/core.c                   |   85 ++++++++++++++++++++++++++++++++
 5 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cea5674..0a18d16 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
+static void x86_pmu_flush_branch_stack(void)
+{
+	if (x86_pmu.flush_branch_stack)
+		x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
-	.pmu_enable	= x86_pmu_enable,
-	.pmu_disable	= x86_pmu_disable,
+	.pmu_enable		= x86_pmu_enable,
+	.pmu_disable		= x86_pmu_disable,
 
 	.attr_groups	= x86_pmu_attr_groups,
 
 	.event_init	= x86_pmu_event_init,
 
-	.add		= x86_pmu_add,
-	.del		= x86_pmu_del,
-	.start		= x86_pmu_start,
-	.stop		= x86_pmu_stop,
-	.read		= x86_pmu_read,
+	.add			= x86_pmu_add,
+	.del			= x86_pmu_del,
+	.start			= x86_pmu_start,
+	.stop			= x86_pmu_stop,
+	.read			= x86_pmu_read,
 
 	.start_txn	= x86_pmu_start_txn,
 	.cancel_txn	= x86_pmu_cancel_txn,
 	.commit_txn	= x86_pmu_commit_txn,
 
 	.event_idx	= x86_pmu_event_idx,
+	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
 void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f104c05..74387c1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -324,6 +324,7 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+	void		(*flush_branch_stack)(void);
 
 	/*
 	 * Intel Arch Perfmon v2+
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7cc1e2d..6627089 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_flush_branch_stack(void)
+{
+	/*
+	 * Intel LBR does not tag entries with the
+	 * PID of the current task, then we need to
+	 * flush it on ctxsw
+	 * For now, we simply reset it
+	 */
+	if (x86_pmu.lbr_nr)
+		intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
+	.flush_branch_stack	= intel_pmu_flush_branch_stack,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5fc494f..fbbf5e5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -746,6 +746,11 @@ struct pmu {
 	 * if no implementation is provided it will default to: event->hw.idx + 1.
 	 */
 	int (*event_idx)		(struct perf_event *event); /*optional */
+
+	/*
+	 * flush branch stack on context-switches (needed in cpu-wide mode)
+	 */
+	void (*flush_branch_stack)	(void);
 };
 
 /**
@@ -979,7 +984,8 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
-	int				nr_cgroups; /* cgroup events present */
+	int				nr_cgroups;	 /* cgroup evts */
+	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 };
 
@@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
+
 struct perf_sample_data {
 	u64				type;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 242bb51..c61234b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -137,6 +137,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -2202,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 }
 
 /*
+ * When sampling the branck stack in system-wide, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branch from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/* no need to flush branch stack if not changing task */
+	if (prev == task)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		/*
+		 * check if the context has at least one
+		 * event using PERF_SAMPLE_BRANCH_STACK
+		 */
+		if (cpuctx->ctx.nr_branch_stack > 0
+		    && pmu->flush_branch_stack) {
+
+			pmu = cpuctx->ctx.pmu;
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->flush_branch_stack();
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+/*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
  *
@@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
+
+	/* check for system-wide branch_stack events */
+	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+		perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
 			static_key_slow_dec_deferred(&perf_sched_events);
 		}
+
+		if (has_branch_stack(event)) {
+			static_key_slow_dec_deferred(&perf_sched_events);
+			/* is system-wide event */
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_dec(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	if (event->rb) {
@@ -5924,6 +6003,12 @@ done:
 				return ERR_PTR(err);
 			}
 		}
+		if (has_branch_stack(event)) {
+			static_key_slow_inc(&perf_sched_events.key);
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_inc(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	return event;

  reply	other threads:[~2012-03-09 13:27 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-09 22:20 [PATCH v6 00/18] perf: add support for sampling taken branches Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 01/18] perf: add generic taken branch sampling support Stephane Eranian
2012-03-09 13:19   ` [tip:perf/hw-branch-sampling] perf: Add " tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 02/18] perf: add Intel LBR MSR definitions Stephane Eranian
2012-03-09 13:20   ` [tip:perf/hw-branch-sampling] perf/x86: Add " tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 03/18] perf: add Intel X86 LBR sharing logic Stephane Eranian
2012-03-09 13:21   ` [tip:perf/hw-branch-sampling] perf/x86: Add Intel " tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 04/18] perf: sync branch stack sampling with X86 precise_sampling Stephane Eranian
2012-03-09 13:21   ` [tip:perf/hw-branch-sampling] perf/x86: Sync branch stack sampling with precise_sampling tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 05/18] perf: add Intel X86 LBR mappings for PERF_SAMPLE_BRANCH filters Stephane Eranian
2012-03-09 13:22   ` [tip:perf/hw-branch-sampling] perf/x86: Add Intel " tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 06/18] perf: disable LBR support for older Intel Atom processors Stephane Eranian
2012-03-09 13:23   ` [tip:perf/hw-branch-sampling] perf/x86: Disable " tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 07/18] perf: implement PERF_SAMPLE_BRANCH for Intel X86 Stephane Eranian
2012-03-09 13:24   ` [tip:perf/hw-branch-sampling] perf/x86: Implement PERF_SAMPLE_BRANCH for Intel CPUs tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 08/18] perf: add LBR software filter support for Intel X86 Stephane Eranian
2012-03-09 13:25   ` [tip:perf/hw-branch-sampling] perf/x86: Add LBR software filter support for Intel CPUs tip-bot for Stephane Eranian
2012-02-09 22:20 ` [PATCH v6 09/18] perf: disable PERF_SAMPLE_BRANCH_* when not supported Stephane Eranian
2012-03-09 13:26   ` [tip:perf/hw-branch-sampling] perf: Disable " tip-bot for Stephane Eranian
2012-02-09 22:21 ` [PATCH v6 10/18] perf: add hook to flush branch_stack on context switch Stephane Eranian
2012-03-09 13:26   ` tip-bot for Stephane Eranian [this message]
2012-02-09 22:21 ` [PATCH v6 11/18] perf: add code to support PERF_SAMPLE_BRANCH_STACK Stephane Eranian
2012-03-09 13:27   ` [tip:perf/hw-branch-sampling] perf tools: Add " tip-bot for Roberto Agostino Vitillo
2012-02-09 22:21 ` [PATCH v6 12/18] perf: add support for sampling taken branch to perf record Stephane Eranian
2012-03-09 13:28   ` [tip:perf/hw-branch-sampling] perf record: Add support for sampling taken branch tip-bot for Roberto Agostino Vitillo
2012-02-09 22:21 ` [PATCH v6 13/18] perf: add support for taken branch sampling to perf report Stephane Eranian
2012-02-14 21:47   ` Arnaldo Carvalho de Melo
2012-02-22 11:15   ` Ingo Molnar
2012-02-22 16:18     ` Stephane Eranian
2012-02-23  9:59       ` Ingo Molnar
2012-02-23 16:53         ` Stephane Eranian
2012-02-23 18:11           ` Arnaldo Carvalho de Melo
2012-02-23 20:48             ` Stephane Eranian
2012-03-09 13:29   ` [tip:perf/hw-branch-sampling] perf report: Add support for taken branch sampling tip-bot for Roberto Agostino Vitillo
2012-02-09 22:21 ` [PATCH v6 14/18] perf: fix endianness detection in perf.data Stephane Eranian
2012-02-09 22:32   ` David Ahern
2012-02-09 22:21 ` [PATCH v6 15/18] perf: add ABI reference sizes Stephane Eranian
2012-03-09 13:30   ` [tip:perf/hw-branch-sampling] perf: Add " tip-bot for Stephane Eranian
2012-02-09 22:21 ` [PATCH v6 16/18] perf: enable reading of perf.data files from different ABI rev Stephane Eranian
2012-02-09 22:39   ` David Ahern
2012-02-09 22:41     ` Stephane Eranian
2012-02-09 22:46       ` David Ahern
2012-02-09 22:49         ` Stephane Eranian
2012-02-10  0:37   ` David Ahern
2012-03-09 13:30   ` [tip:perf/hw-branch-sampling] perf tools: Enable reading of perf. data " tip-bot for Stephane Eranian
2012-02-09 22:21 ` [PATCH v6 17/18] perf: fix bug print_event_desc() Stephane Eranian
2012-03-09 13:31   ` [tip:perf/hw-branch-sampling] perf tools: Fix ABI compatibility bug in print_event_desc() tip-bot for Stephane Eranian
2012-02-09 22:21 ` [PATCH v6 18/18] perf: make perf able to read file from older ABIs Stephane Eranian
2012-03-09 13:32   ` [tip:perf/hw-branch-sampling] perf tools: Make perf able to read files " tip-bot for Stephane Eranian
2012-02-27  7:50 ` [PATCH v6 00/18] perf: add support for sampling taken branches Anshuman Khandual
2012-02-27  8:45   ` Stephane Eranian

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=tip-d010b3326cf06b3406cdd88af16dcf4e4b6fec2e@git.kernel.org \
    --to=eranian@google.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-tip-commits@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).