From: Stephane Eranian <eranian@google.com>
To: linux-kernel@vger.kernel.org
Cc: peterz@infradead.org, mingo@elte.hu, ak@linux.intel.com,
acme@redhat.com, jolsa@redhat.com, namhyung.kim@lge.com
Subject: [PATCH v5 1/2] perf: use hrtimer for event multiplexing
Date: Fri, 22 Mar 2013 11:51:38 +0100 [thread overview]
Message-ID: <1363949499-3728-2-git-send-email-eranian@google.com> (raw)
In-Reply-To: <1363949499-3728-1-git-send-email-eranian@google.com>
The current scheme of using the timer tick was fine
for per-thread events. However, it was causing
bias issues in system-wide mode (including for
uncore PMUs). Event groups would not get their
fair share of runtime on the PMU. With tickless
kernels, if a core is idle there is no timer tick,
and thus no event rotation (multiplexing). However,
there are events (especially uncore events) which do
count even though cores are asleep.
This patch changes the timer source for multiplexing.
It introduces a per-PMU per-cpu hrtimer. The advantage
is that even when a core goes idle, it will come back
to service the hrtimer, thus multiplexing on system-wide
events works much better.
The per-PMU implementation (suggested by PeterZ) enables
adjusting the multiplexing interval per PMU. The preferred
interval is stashed into the struct pmu. If not set, it
will be forced to the default interval value.
In order to minimize the impact of the hrtimer, it
is turned on and off on demand. When the PMU on
a CPU is overcommitted, the hrtimer is activated.
It is stopped when the PMU is not overcommitted.
In order for this to work properly, we had to change
the order of initialization in start_kernel() such
that hrtimer_init() is run before perf_event_init().
The default interval in milliseconds is set to a
timer tick just like with the old code. We will
provide a sysctl to tune this in another patch.
Signed-off-by: Stephane Eranian <eranian@google.com>
---
include/linux/perf_event.h | 3 +-
init/main.c | 2 +-
kernel/events/core.c | 114 ++++++++++++++++++++++++++++++++++++++++----
3 files changed, 109 insertions(+), 10 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 12a1aa2..e2e6c7e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -500,8 +500,9 @@ struct perf_cpu_context {
struct perf_event_context *task_ctx;
int active_oncpu;
int exclusive;
+ struct hrtimer hrtimer;
+ ktime_t hrtimer_interval;
struct list_head rotation_list;
- int jiffies_interval;
struct pmu *unique_pmu;
struct perf_cgroup *cgrp;
};
diff --git a/init/main.c b/init/main.c
index b3e0614..1542b38 100644
--- a/init/main.c
+++ b/init/main.c
@@ -544,7 +544,6 @@ asmlinkage void __init start_kernel(void)
local_irq_disable();
}
idr_init_cache();
- perf_event_init();
rcu_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
@@ -556,6 +555,7 @@ asmlinkage void __init start_kernel(void)
softirq_init();
timekeeping_init();
time_init();
+ perf_event_init();
profile_init();
call_function_init();
if (!irqs_disabled())
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b4a55d..92d34ad 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -169,6 +169,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly =
DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -642,6 +644,98 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
#endif
+/*
+ * Default multiplexing interval, in milliseconds: one timer
+ * tick, matching the original tick-based rotation code.
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ unsigned long flags;
+ int rotations = 0;
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ local_irq_save(flags);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ local_irq_restore(flags);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ cpuctx->hrtimer_interval =
+ ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1480,6 +1574,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1526,6 +1621,8 @@ group_sched_in(struct perf_event *group_event,
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+
return -EAGAIN;
}
@@ -1781,8 +1878,10 @@ static int __perf_event_enable(void *info)
* If this event can't go on and it's part of a
* group, then the whole group has to come off.
*/
- if (leader != event)
+ if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
if (leader->attr.pinned) {
update_group_times(leader);
leader->state = PERF_EVENT_STATE_ERROR;
@@ -2529,7 +2628,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
* because they're strictly cpu affine and rotate_start is called with IRQs
* disabled, while rotate_context is called from IRQ context.
*/
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0, remove = 1;
@@ -2568,6 +2667,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
+
+ return rotate;
}
void perf_event_task_tick(void)
@@ -2589,10 +2690,6 @@ void perf_event_task_tick(void)
ctx = cpuctx->task_ctx;
if (ctx)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- if (cpuctx->jiffies_interval == 1 ||
- !(jiffies % cpuctx->jiffies_interval))
- perf_rotate_context(cpuctx);
}
}
@@ -6014,7 +6111,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
- cpuctx->jiffies_interval = 1;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
INIT_LIST_HEAD(&cpuctx->rotation_list);
cpuctx->unique_pmu = pmu;
}
@@ -7400,7 +7499,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
-
default:
break;
}
--
1.7.9.5
next prev parent reply other threads:[~2013-03-22 10:51 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-22 10:51 [PATCH v5 0/2] perf: use hrtimer for event multiplexing Stephane Eranian
2013-03-22 10:51 ` Stephane Eranian [this message]
2013-03-25 11:13 ` [PATCH v5 1/2] " Peter Zijlstra
2013-03-25 17:58 ` Stephane Eranian
2013-03-22 10:51 ` [PATCH v5 2/2] perf: add sysfs entry to adjust multiplexing interval per PMU Stephane Eranian
2013-03-25 11:43 ` Peter Zijlstra
2013-03-25 12:15 ` Stephane Eranian
2013-03-22 13:54 ` [PATCH v5 0/2] perf: use hrtimer for event multiplexing Frederic Weisbecker
2013-03-25 12:08 ` Peter Zijlstra
2013-03-25 12:21 ` Frederic Weisbecker
2013-03-22 17:33 ` Andi Kleen
2013-03-22 17:36 ` Stephane Eranian
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363949499-3728-2-git-send-email-eranian@google.com \
--to=eranian@google.com \
--cc=acme@redhat.com \
--cc=ak@linux.intel.com \
--cc=jolsa@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=namhyung.kim@lge.com \
--cc=peterz@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox