[RFC] Sharing PMU counters across compatible events

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC] Sharing PMU counters across compatible events
@ 2017-12-01 14:19 Tejun Heo
  2017-12-06 11:42 ` Jiri Olsa
  2017-12-06 12:35 ` Peter Zijlstra
  0 siblings, 2 replies; 9+ messages in thread
From: Tejun Heo @ 2017-12-01 14:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim
  Cc: linux-kernel, kernel-team

Hello,

There are cases where a single PMU event, let's say instructions, is
interesting for different subsets of the system.  For example, it
could be interesting to monitor instructions system-wide, at cgroup
level and per each thread.

This could easily be me not knowing better but I can't think of a way
to do the above without consuming multiple PMU counters to monitor
instructions, which quickly gets out of hand with the product of
multiple domains and counters.  Getting broken down numbers and adding
up doesn't work at cgroup (the numbers aren't mutually exclusive) or
thread level.

If there's a way to achieve this using the existing features, I'd be
glad to learn about it.  If not, the patch at the bottom is a crude
half-working proof-of-concept to share PMU counters across compatible
events.

In a nutshell, while adding events, it looks whether there already is
a compatible event.  If so, instead of enabling the new event, it gets
linked to the already enabled one (the master event) and count of the
dup event is updated by adding the delta of the master event.

That patch is just a proof of concept.  It's missing counter
propagation somewhere and the dup counts end up somewhat lower than
they should be.  The master selection and switching are half-broken.
Event compatibility testing is barely implemented, so on and so forth.
However, it does allow gathering events for different targets without
consuming multiple PMU counters.

What do you think?  Would this be something worth pursuing?

Thank you very much.

---
 include/linux/perf_event.h |    7 ++
 kernel/events/core.c       |  129 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 126 insertions(+), 10 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -583,6 +583,13 @@ struct perf_event {
 	local64_t			count;
 	atomic64_t			child_count;
 
+	struct list_head		active_event_entry;
+
+	struct perf_event		*dup_master;
+	struct list_head		dup_list;
+	u64				dup_base_count;
+	u64				dup_base_child_count;
+
 	/*
 	 * These are the total time in nanoseconds that the event
 	 * has been enabled (i.e. eligible to run, and the task has
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1119,6 +1119,7 @@ void perf_pmu_enable(struct pmu *pmu)
 }
 
 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
+static DEFINE_PER_CPU(struct list_head, active_event_list);
 
 /*
  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
@@ -1788,12 +1789,58 @@ event_filter_match(struct perf_event *ev
 	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
+static void event_sync_dup(struct perf_event *dup)
+{
+	struct perf_event *master = dup->dup_master;
+	u64 new_count = local64_read(&master->count);
+	u64 new_child_count = atomic64_read(&master->child_count);
+
+	local64_add(new_count - dup->dup_base_count, &dup->count);
+	atomic64_add(new_child_count - dup->dup_base_child_count,
+		     &dup->child_count);
+
+	dup->dup_base_count = new_count;
+	dup->dup_base_child_count = new_child_count;
+}
+
+static bool event_cleanup_dup(struct perf_event *event,
+			      struct perf_event **first_dupp)
+{
+	struct perf_event *first_dup = NULL;
+	struct perf_event *dup, *tmp;
+
+	if (event->dup_master) {
+		event_sync_dup(event);
+		list_del_init(&event->dup_list);
+		event->dup_master = NULL;
+		return true;
+	}
+
+	list_for_each_entry_safe(dup, tmp, &event->dup_list, dup_list) {
+		event_sync_dup(dup);
+
+		if (!first_dup) {
+			*first_dupp = first_dup = dup;
+			dup->dup_master = NULL;
+			list_del_init(&dup->dup_list);
+		} else {
+			dup->dup_master = first_dup;
+			list_move_tail(&dup->dup_list, &first_dup->dup_list);
+			dup->dup_base_count = local64_read(&first_dup->count);
+			dup->dup_base_child_count =
+				atomic64_read(&first_dup->child_count);
+		}
+	}
+	return false;
+}
+
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
 	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+	struct perf_event *first_dup = NULL;
 
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
@@ -1803,7 +1850,17 @@ event_sched_out(struct perf_event *event
 
 	perf_pmu_disable(event->pmu);
 
-	event->pmu->del(event, 0);
+	list_del_init(&event->active_event_entry);
+
+	if (!event_cleanup_dup(event, &first_dup)) {
+		event->pmu->del(event, 0);
+		/*
+		 * XXX: Should probably use a virtual master and avoid hot
+		 * swapping masters.
+		 */
+		if (first_dup)
+			WARN_ON_ONCE(event->pmu->add(first_dup, PERF_EF_START));
+	}
 	event->oncpu = -1;
 
 	if (event->pending_disable) {
@@ -2038,6 +2095,45 @@ static void perf_set_shadow_time(struct
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static bool event_setup_dup(struct perf_event *event,
+			    struct perf_cpu_context *cpuctx,
+			    struct perf_event_context *ctx)
+{
+	struct list_head *head = this_cpu_ptr(&active_event_list);
+	struct perf_event_attr *attr = &event->attr;
+	struct perf_event *tevent;
+
+	/* XXX: let's just do instructions for now */
+	if (!(attr->type == PERF_TYPE_HARDWARE &&
+	      attr->config == PERF_COUNT_HW_INSTRUCTIONS))
+		return false;
+
+	/* XXX: no group support yet either */
+	if (attr->read_format & PERF_FORMAT_GROUP)
+		return false;
+
+	list_for_each_entry(tevent, head, active_event_entry) {
+		struct perf_event_attr *tattr = &tevent->attr;
+
+		if (tevent->dup_master)
+			continue;
+
+		/* XXX: this definitely isn't enough */
+		if (attr->type == tattr->type && attr->config == tattr->config &&
+		    attr->freq == tattr->freq &&
+		    attr->sample_freq == tattr->sample_freq) {
+			event->dup_master = tevent;
+			list_add_tail(&event->dup_list, &tevent->dup_list);
+			event->dup_base_count = local64_read(&tevent->count);
+			event->dup_base_child_count =
+				atomic64_read(&tevent->child_count);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -2075,7 +2171,8 @@ event_sched_in(struct perf_event *event,
 
 	perf_log_itrace_start(event);
 
-	if (event->pmu->add(event, PERF_EF_START)) {
+	if (!event_setup_dup(event, cpuctx, ctx) &&
+	    event->pmu->add(event, PERF_EF_START)) {
 		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 		event->oncpu = -1;
 		ret = -EAGAIN;
@@ -2092,6 +2189,7 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	list_add_tail(&event->active_event_entry, this_cpu_ptr(&active_event_list));
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -2745,6 +2843,14 @@ static int context_equiv(struct perf_eve
 	return 0;
 }
 
+static void event_pmu_read(struct perf_event *event)
+{
+	if (!event->dup_master)
+		event->pmu->read(event);
+	else
+		event_sync_dup(event);
+}
+
 static void __perf_event_sync_stat(struct perf_event *event,
 				     struct perf_event *next_event)
 {
@@ -2761,7 +2867,7 @@ static void __perf_event_sync_stat(struc
 	 * don't need to use it.
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	perf_event_update_time(event);
 
@@ -3528,14 +3634,14 @@ static void __perf_event_read(void *info
 		goto unlock;
 
 	if (!data->group) {
-		pmu->read(event);
+		event_pmu_read(event);
 		data->ret = 0;
 		goto unlock;
 	}
 
 	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 
-	pmu->read(event);
+	event_pmu_read(event);
 
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
@@ -3543,7 +3649,7 @@ static void __perf_event_read(void *info
 			 * Use sibling's PMU rather than @event's since
 			 * sibling could be on different (eg: software) PMU.
 			 */
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 		}
 	}
 
@@ -3607,7 +3713,7 @@ int perf_event_read_local(struct perf_ev
 	 * oncpu == -1).
 	 */
 	if (event->oncpu == smp_processor_id())
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	*value = local64_read(&event->count);
 	if (enabled || running) {
@@ -5718,7 +5824,7 @@ static void perf_output_read_group(struc
 		values[n++] = running;
 
 	if (leader != event)
-		leader->pmu->read(leader);
+		event_pmu_read(leader);
 
 	values[n++] = perf_event_count(leader);
 	if (read_format & PERF_FORMAT_ID)
@@ -5731,7 +5837,7 @@ static void perf_output_read_group(struc
 
 		if ((sub != event) &&
 		    (sub->state == PERF_EVENT_STATE_ACTIVE))
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 
 		values[n++] = perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
@@ -8555,7 +8661,7 @@ static enum hrtimer_restart perf_swevent
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return HRTIMER_NORESTART;
 
-	event->pmu->read(event);
+	event_pmu_read(event);
 
 	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
@@ -9383,6 +9489,8 @@ perf_event_alloc(struct perf_event_attr
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->active_event_entry);
+	INIT_LIST_HEAD(&event->dup_list);
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
@@ -10981,6 +11089,7 @@ static void __init perf_event_init_all_c
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+		INIT_LIST_HEAD(&per_cpu(active_event_list, cpu));
 
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-01 14:19 [RFC] Sharing PMU counters across compatible events Tejun Heo
@ 2017-12-06 11:42 ` Jiri Olsa
  2017-12-11 15:34   ` Tejun Heo
  2017-12-06 12:35 ` Peter Zijlstra
  1 sibling, 1 reply; 9+ messages in thread
From: Jiri Olsa @ 2017-12-06 11:42 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Namhyung Kim, linux-kernel, kernel-team

On Fri, Dec 01, 2017 at 06:19:50AM -0800, Tejun Heo wrote:
> Hello,
> 
> There are cases where a single PMU event, let's say instructions, is
> interesting for different subsets of the system.  For example, it
> could be interesting to monitor instructions system-wide, at cgroup
> level and per each thread.
> 
> This could easily be me not knowing better but I can't think of a way
> to do the above without consuming multiple PMU counters to monitor
> instructions, which quickly gets out of hand with the product of
> multiple domains and counters.  Getting broken down numbers and adding
> up doesn't work at cgroup (the numbers aren't mutually exclusive) or
> thread level.
> 
> If there's a way to achieve this using the existing features, I'd be
> glad to learn about it.  If not, the patch at the bottom is a crude
> half-working proof-of-concept to share PMU counters across compatible
> events.
> 
> In a nutshell, while adding events, it looks whether there already is
> a compatible event.  If so, instead of enabling the new event, it gets
> linked to the already enabled one (the master event) and count of the
> dup event is updated by adding the delta of the master event.
> 
> That patch is just a proof of concept.  It's missing counter
> propagation somewhere and the dup counts end up somewhat lower than
> they should be.  The master selection and switching are half-broken.
> Event compatibility testing is barely implemented, so on and so forth.
> However, it does allow gathering events for different targets without
> consuming multiple PMU counters.
> 
> What do you think?  Would this be something worth pursuing?

I see this rather on the hw level, since it concerns HW counters

I think we could detect same (alias) events at the time counters
are added/removed on/from the cpu and share their HW part like
counter idx, regs and such (struct hw_perf_event_cpu in my changes)

this way it'd be completely transparent for generic code

I made some untested (just compile) changes and attaching the
top patch, that has the main logic, please check all changes
in here:
  git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
  perf/alias_2

there's big change due to changing the perf_event::hw stuff
but the rest is quite simple (attached).. not completely sure
yet if I'm missing some functionality which would break due
to this change

thoughts?

thanks,
jirka


---
 arch/x86/events/core.c     | 58 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       |  1 +
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 2bb5d1b2a622..737857f07881 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1189,6 +1189,54 @@ void x86_pmu_enable_event(struct perf_event *event)
 				       ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
+static bool is_alias(struct perf_event *alias, struct perf_event *event)
+{
+	if (is_sampling_event(alias))
+		return false;
+
+	return memcmp(&alias->attr, &event->attr, sizeof(alias->attr));
+}
+
+static int alias_add(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	struct perf_event *master;
+	int n;
+
+	for (n = 0; n < cpuc->n_events; n++) {
+		if (is_alias(event, cpuc->event_list[n]))
+			break;
+	}
+
+	if (n == cpuc->n_events)
+		return 0;
+
+	master = cpuc->event_list[n];
+	event->hw.cpu = master->hw.cpu;
+	list_add_tail(&event->hw.alias, &master->hw.master);
+	return 1;
+}
+
+static int alias_del(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	if (event->hw.cpu != &event->hw.cpu_local)
+		return 0;
+
+	event->hw.cpu = &event->hw.cpu_local;
+	list_del(&event->hw.alias);
+	return 1;
+}
+
+static void alias_update(struct perf_event *event)
+{
+	struct perf_event *alias;
+
+	list_for_each_entry(alias, &event->hw.master, hw.alias) {
+		alias->count          = event->count;
+		alias->hw.prev_count  = event->hw.prev_count;
+		alias->hw.period_left = event->hw.period_left;
+	}
+}
+
 /*
  * Add a single event to the PMU.
  *
@@ -1200,7 +1248,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc;
 	int assign[X86_PMC_IDX_MAX];
-	int n, n0, ret;
+	int n, n0, ret = 0;
+
+	if (alias_add(cpuc, event))
+		goto out;
 
 	hwc = &event->hw;
 
@@ -1383,11 +1434,16 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto do_del;
 
+	if (alias_del(cpuc, event))
+		goto do_del;
+
 	/*
 	 * Not a TXN, therefore cleanup properly.
 	 */
 	x86_pmu_stop(event, PERF_EF_UPDATE);
 
+	alias_update(event);
+
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event_list[i])
 			break;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8bc5ddfbffd..6a09914a3555 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -136,6 +136,10 @@ struct hw_perf_event {
 		struct { /* hardware */
 			struct hw_perf_event_cpu *cpu;
 			struct hw_perf_event_cpu  cpu_local;
+			union {
+				struct list_head master;
+				struct list_head alias;
+			};
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5879c67d90e4..9cf36b2b533f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9513,6 +9513,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->active_event_entry);
 	INIT_LIST_HEAD(&event->dup_list);
 	INIT_LIST_HEAD(&event->addr_filters.list);
+	INIT_LIST_HEAD(&event->hw.master);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
 
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-01 14:19 [RFC] Sharing PMU counters across compatible events Tejun Heo
  2017-12-06 11:42 ` Jiri Olsa
@ 2017-12-06 12:35 ` Peter Zijlstra
  2017-12-11 15:47   ` Tejun Heo
  1 sibling, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2017-12-06 12:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, kernel-team,
	alexey.budankov

On Fri, Dec 01, 2017 at 06:19:50AM -0800, Tejun Heo wrote:

> What do you think?  Would this be something worth pursuing?

My worry with the whole thing is that it makes PMU scheduling _far_ more
expensive.

Currently HW PMU scheduling is 'bounded' by the fact that we have
bounded hardware resources (actually placing the events on these
resources is already very complex because not every event can go on
every counter).

We also stop trying to schedule HW events when we find we cannot place
more.

If we were to support this sharing thing (and you were correct in noting
that the specific conditions for matching events is going to be very
tricky indeed), both the above go out the window.

And this is already an area people are optimizing; see for example the
patches here:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/core

(which are WIP, but I got distracted with that whole KAISER stuff) they
make PMU scheduling up to 4x faster for some workload for not having to
iterate through the list finding events for the 'wrong' CPU.

With sharing we can no longer say the hardware is full, we done. But we
have to exhaustively try each and every event, just in case it can
share.

I realize we already have an unbounded case with software events; and we
really should do something about that, instead of making it worse :/

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-06 11:42 ` Jiri Olsa
@ 2017-12-11 15:34   ` Tejun Heo
  2017-12-13 10:18     ` Jiri Olsa
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2017-12-11 15:34 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Namhyung Kim, linux-kernel, kernel-team

Hello, Jiri.

On Wed, Dec 06, 2017 at 12:42:04PM +0100, Jiri Olsa wrote:
> I see this rather on the hw level, since it concerns HW counters
> 
> I think we could detect same (alias) events at the time counters
> are added/removed on/from the cpu and share their HW part like
> counter idx, regs and such (struct hw_perf_event_cpu in my changes)
> 
> this way it'd be completely transparent for generic code

I don't quite follow why doing this in arch code is better than
generic.  Doing this in arch means we'd need to do the same thing
multiple times for different archs.  Why is that better?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-06 12:35 ` Peter Zijlstra
@ 2017-12-11 15:47   ` Tejun Heo
  2017-12-12 22:37     ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2017-12-11 15:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, kernel-team,
	alexey.budankov

Hello, Peter.

On Wed, Dec 06, 2017 at 01:35:00PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 01, 2017 at 06:19:50AM -0800, Tejun Heo wrote:
> 
> > What do you think?  Would this be something worth pursuing?
> 
> My worry with the whole thing is that it makes PMU scheduling _far_ more
> expensive.
>
> Currently HW PMU scheduling is 'bounded' by the fact that we have
> bounded hardware resources (actually placing the events on these
> resources is already very complex because not every event can go on
> every counter).
>
> We also stop trying to schedule HW events when we find we cannot place
> more.
> 
> If we were to support this sharing thing (and you were correct in noting
> that the specific conditions for matching events is going to be very
> tricky indeed), both the above go out the window.

Understood, but I wonder whether something like this can be made
significantly cheaper and, hopefully, bound.  I could easily be
getting the details wrong, but it doesn't seem like we'd need to
compute much of these dynamically on context switch.

Let's say that we can pre-compute most of mergeable detections and the
value propagation can be pushed to the read time rather than event
time and thus that we can have the same functionality with
insiginficant hot path overhead.  Does that sound like something
acceptable to you?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-11 15:47   ` Tejun Heo
@ 2017-12-12 22:37     ` Peter Zijlstra
  2017-12-13 16:18       ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2017-12-12 22:37 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, kernel-team,
	alexey.budankov

On Mon, Dec 11, 2017 at 07:47:44AM -0800, Tejun Heo wrote:
> Hello, Peter.
> 
> On Wed, Dec 06, 2017 at 01:35:00PM +0100, Peter Zijlstra wrote:
> > On Fri, Dec 01, 2017 at 06:19:50AM -0800, Tejun Heo wrote:
> > 
> > > What do you think?  Would this be something worth pursuing?
> > 
> > My worry with the whole thing is that it makes PMU scheduling _far_ more
> > expensive.
> >
> > Currently HW PMU scheduling is 'bounded' by the fact that we have
> > bounded hardware resources (actually placing the events on these
> > resources is already very complex because not every event can go on
> > every counter).
> >
> > We also stop trying to schedule HW events when we find we cannot place
> > more.
> > 
> > If we were to support this sharing thing (and you were correct in noting
> > that the specific conditions for matching events is going to be very
> > tricky indeed), both the above go out the window.
> 
> Understood, but I wonder whether something like this can be made
> significantly cheaper and, hopefully, bound.  I could easily be
> getting the details wrong, but it doesn't seem like we'd need to
> compute much of these dynamically on context switch.
> 
> Let's say that we can pre-compute most of mergeable detections and the
> value propagation can be pushed to the read time rather than event
> time and thus that we can have the same functionality with
> insiginficant hot path overhead.  Does that sound like something
> acceptable to you?

That would be a fairly massive change from how perf works today. And the
obvious pain point would be changing the per-cpu event set, which would
mean recomputing all possible combinations of task sets.

Also note that each context (cpu,task) is allowed to have more events
than fit on the PMU, at which point we'll start rotating events. Do we
also pre-compute all possible rotation sets?

Just not quite seeing this..

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-11 15:34   ` Tejun Heo
@ 2017-12-13 10:18     ` Jiri Olsa
  2017-12-13 16:15       ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Jiri Olsa @ 2017-12-13 10:18 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Namhyung Kim, linux-kernel, kernel-team

On Mon, Dec 11, 2017 at 07:34:49AM -0800, Tejun Heo wrote:
> Hello, Jiri.
> 
> On Wed, Dec 06, 2017 at 12:42:04PM +0100, Jiri Olsa wrote:
> > I see this rather on the hw level, since it concerns HW counters
> > 
> > I think we could detect same (alias) events at the time counters
> > are added/removed on/from the cpu and share their HW part like
> > counter idx, regs and such (struct hw_perf_event_cpu in my changes)
> > 
> > this way it'd be completely transparent for generic code
> 
> I don't quite follow why doing this in arch code is better than
> generic.  Doing this in arch means we'd need to do the same thing
> multiple times for different archs.  Why is that better?

so I can see this to be useful for HW conters only, because
of limited number of regs

as for the higher level on which this could be implemented I
see some pitfals with event rotations as Peter mentioned and
task/cpu contexts scheduling.. while the hw-level implementation
seems pretty straight forward

I'll test the code and let's see ;-)

jirka

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-13 10:18     ` Jiri Olsa
@ 2017-12-13 16:15       ` Tejun Heo
  0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2017-12-13 16:15 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Namhyung Kim, linux-kernel, kernel-team

Hello, Jiri.

On Wed, Dec 13, 2017 at 11:18:07AM +0100, Jiri Olsa wrote:
> so I can see this to be useful for HW conters only, because
> of limited number of regs
> 
> as for the higher level on which this could be implemented I
> see some pitfals with event rotations as Peter mentioned and
> task/cpu contexts scheduling.. while the hw-level implementation
> seems pretty straight forward

Heh, yeah, maybe.  

> I'll test the code and let's see ;-)

Thanks.  I'll think more about it and see whether this can be evolved
into something useable.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] Sharing PMU counters across compatible events
  2017-12-12 22:37     ` Peter Zijlstra
@ 2017-12-13 16:18       ` Tejun Heo
  0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2017-12-13 16:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, kernel-team,
	alexey.budankov

Hello, Peter.

On Tue, Dec 12, 2017 at 11:37:36PM +0100, Peter Zijlstra wrote:
> That would be a fairly massive change from how perf works today. And the
> obvious pain point would be changing the per-cpu event set, which would
> mean recomputing all possible combinations of task sets.
> 
> Also note that each context (cpu,task) is allowed to have more events
> than fit on the PMU, at which point we'll start rotating events. Do we
> also pre-compute all possible rotation sets?
> 
> Just not quite seeing this..

Yeah, right.  Will think more about it

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-12-13 16:18 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-01 14:19 [RFC] Sharing PMU counters across compatible events Tejun Heo
2017-12-06 11:42 ` Jiri Olsa
2017-12-11 15:34   ` Tejun Heo
2017-12-13 10:18     ` Jiri Olsa
2017-12-13 16:15       ` Tejun Heo
2017-12-06 12:35 ` Peter Zijlstra
2017-12-11 15:47   ` Tejun Heo
2017-12-12 22:37     ` Peter Zijlstra
2017-12-13 16:18       ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox