public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: David Carrillo-Cisneros <davidcc@google.com>
To: linux-kernel@vger.kernel.org
Cc: "x86@kernel.org" <x86@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andi Kleen <ak@linux.intel.com>, Kan Liang <kan.liang@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Borislav Petkov <bp@suse.de>,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Vikas Shivappa <vikas.shivappa@linux.intel.com>,
	Mark Rutland <mark.rutland@arm.com>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Vince Weaver <vince@deater.net>, Paul Turner <pjt@google.com>,
	Stephane Eranian <eranian@google.com>,
	David Carrillo-Cisneros <davidcc@google.com>
Subject: [RFC 1/6] perf/core: create active and inactive event groups
Date: Tue, 10 Jan 2017 02:24:57 -0800	[thread overview]
Message-ID: <20170110102502.106187-2-davidcc@google.com> (raw)
In-Reply-To: <20170110102502.106187-1-davidcc@google.com>

Currently, perf uses pinned_groups and flexible_groups for sched in/out.
We can do better because:
  - sched out only cares about the ACTIVE events, which are usually a
  small set of events.
  - There can be many events in these lists that are not relevant to
  the scheduler (e.g. other CPU/cgroups, events in OFF and ERROR state).

Reduce the set of events to iterate over each context switch by adding
three new lists: active_pinned_groups, active_flexible_groups and
inactive_groups. All events in each list are in the same state so we
avoid checking state. It also saves the iteration over events in OFF and
ERROR state during sched in/out.

The main impact of this patch is that ctx_sched_out can use the "small"
active_{pinned,flexible}_groups instead of the potentially much larger
{pinned,flexible}_groups.

There is no separate pinned/flexible version of the inactive list because
the next patches will create an index on it.

Bookkeeping of the new lists is more involved, but it can provide a
potentially large speed up. The new lists are intended to eventually
replace {pinned,flexible}_groups, although that's not yet implemented.

The inactive list is kept in FIFO order and only added after
group_sched_in has succeeded. This guarantees that it's in timestamp
order.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
---
 include/linux/perf_event.h |  6 +++
 kernel/events/core.c       | 93 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..3fa18f05c9b0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -573,6 +573,7 @@ struct perf_event {
 
 	struct hlist_node		hlist_entry;
 	struct list_head		active_entry;
+	struct list_head		ctx_active_entry;
 	int				nr_siblings;
 
 	/* Not serialized. Only written during event initialization. */
@@ -734,6 +735,11 @@ struct perf_event_context {
 	struct list_head		active_ctx_list;
 	struct list_head		pinned_groups;
 	struct list_head		flexible_groups;
+
+	struct list_head		active_pinned_groups;
+	struct list_head		active_flexible_groups;
+	struct list_head		inactive_groups;
+
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index faf073d0287f..b744b5a8dbd0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1462,6 +1462,21 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 		return &ctx->flexible_groups;
 }
 
+static void
+ctx_sched_groups_to_inactive(struct perf_event *event,
+			     struct perf_event_context *ctx)
+{
+	WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
+	list_move_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+};
+
+static void
+ctx_sched_groups_add(struct perf_event *event, struct perf_event_context *ctx)
+{
+	WARN_ON(!list_empty(&event->ctx_active_entry));
+	list_add_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1487,10 +1502,11 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 		list = ctx_group_list(event, ctx);
 		list_add_tail(&event->group_entry, list);
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_add(event, ctx);
 	}
 
 	list_update_cgroup_event(event, ctx, true);
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1648,6 +1664,13 @@ static void perf_group_attach(struct perf_event *event)
 		perf_event__header_size(pos);
 }
 
+static void ctx_sched_groups_del(struct perf_event *group,
+				 struct perf_event_context *ctx)
+{
+	WARN_ON(group->state != PERF_EVENT_STATE_INACTIVE);
+	list_del_init(&group->ctx_active_entry);
+}
+
 /*
  * Remove a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1674,8 +1697,11 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_del_rcu(&event->event_entry);
 
-	if (event->group_leader == event)
+	if (event->group_leader == event) {
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_del(event, ctx);
 		list_del_init(&event->group_entry);
+	}
 
 	update_group_times(event);
 
@@ -1851,6 +1877,11 @@ group_sched_out(struct perf_event *group_event,
 
 	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
+
+	if (group_event->state <= PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_to_inactive(group_event, ctx);
+	if (group_event->state < PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(group_event, ctx);
 }
 
 #define DETACH_GROUP	0x01UL
@@ -1918,6 +1949,8 @@ static void __perf_event_disable(struct perf_event *event,
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(event, ctx);
 	event->state = PERF_EVENT_STATE_OFF;
 }
 
@@ -2014,6 +2047,17 @@ static void perf_set_shadow_time(struct perf_event *event,
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static void
+ctx_sched_groups_to_active(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct list_head *h = event->attr.pinned ? &ctx->active_pinned_groups :
+						   &ctx->active_flexible_groups;
+	WARN_ON(!event);
+	WARN_ON(list_empty(&event->ctx_active_entry));
+	WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
+	list_move_tail(&event->ctx_active_entry, h);
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -2091,9 +2135,7 @@ group_sched_in(struct perf_event *group_event,
 	u64 now = ctx->time;
 	bool simulate = false;
 
-	if (group_event->state == PERF_EVENT_STATE_OFF)
-		return 0;
-
+	WARN_ON(group_event->state != PERF_EVENT_STATE_INACTIVE);
 	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
@@ -2112,9 +2154,10 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		ctx_sched_groups_to_active(group_event, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
@@ -2396,6 +2439,7 @@ static void __perf_event_enable(struct perf_event *event,
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	if (!ctx->is_active)
 		return;
@@ -2611,7 +2655,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			  enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
-	struct perf_event *event;
+	struct perf_event *event, *tmp;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -2658,13 +2702,17 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_pinned_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_flexible_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 	perf_pmu_enable(ctx->pmu);
 }
@@ -2962,10 +3010,11 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		if (!event_filter_match(event))
 			continue;
@@ -2983,6 +3032,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		 */
 		if (event->state == PERF_EVENT_STATE_INACTIVE) {
 			update_group_times(event);
+			ctx_sched_groups_del(event, ctx);
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
@@ -2992,12 +3042,12 @@ static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 	int can_add_hw = 1;
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
@@ -3389,6 +3439,7 @@ static int event_enable_on_exec(struct perf_event *event,
 		return 0;
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	return 1;
 }
@@ -3639,6 +3690,9 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	INIT_LIST_HEAD(&ctx->pinned_groups);
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
+	INIT_LIST_HEAD(&ctx->active_pinned_groups);
+	INIT_LIST_HEAD(&ctx->active_flexible_groups);
+	INIT_LIST_HEAD(&ctx->inactive_groups);
 	atomic_set(&ctx->refcount, 1);
 }
 
@@ -9109,6 +9163,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->ctx_active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
@@ -10085,6 +10140,10 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
+
+	if (!parent_event && child_event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(parent_event, child_ctx);
+
 	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
-- 
2.11.0.390.gc69c2f50cf-goog

  reply	other threads:[~2017-01-10 10:25 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-10 10:24 [RFC 0/6] optimize ctx switch with rb-tree David Carrillo-Cisneros
2017-01-10 10:24 ` David Carrillo-Cisneros [this message]
2017-01-10 13:49   ` [RFC 1/6] perf/core: create active and inactive event groups Mark Rutland
2017-01-10 20:45     ` David Carrillo-Cisneros
2017-01-12 11:05       ` Mark Rutland
     [not found]         ` <CALcN6mhPmpSqKhE3Ua+j-xROLzeAyrgdCk4AGGtfF9kExXRTJg@mail.gmail.com>
2017-01-13 11:01           ` Mark Rutland
2017-01-10 10:24 ` [RFC 2/6] perf/core: add a rb-tree index to inactive_groups David Carrillo-Cisneros
2017-01-10 14:14   ` Mark Rutland
2017-01-10 20:20     ` David Carrillo-Cisneros
2017-01-12 11:47       ` Mark Rutland
2017-01-13  7:34         ` David Carrillo-Cisneros
2017-01-16  2:03   ` [lkp-developer] [perf/core] 33da94bd89: BUG:unable_to_handle_kernel kernel test robot
2017-01-10 10:24 ` [RFC 3/6] perf/core: use rb-tree to sched in event groups David Carrillo-Cisneros
2017-01-10 16:38   ` Mark Rutland
2017-01-10 20:51     ` David Carrillo-Cisneros
2017-01-12 12:14       ` Mark Rutland
2017-01-13  8:01         ` David Carrillo-Cisneros
2017-01-13 10:24           ` Mark Rutland
2017-01-11 20:31     ` Liang, Kan
2017-01-12 10:11       ` Mark Rutland
2017-01-12 13:28         ` Liang, Kan
2017-01-13  8:05           ` David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 4/6] perf/core: avoid rb-tree traversal when no inactive events David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 5/6] perf/core: rotation no longer necessary. Behavior has changed. Beware David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 6/6] perf/core: use rb-tree index to optimize filtered perf_iterate_ctx David Carrillo-Cisneros
2017-01-16  2:05   ` [lkp-developer] [perf/core] 49c04ee1a7: WARNING:at_kernel/events/core.c:#perf_iterate_ctx_matching kernel test robot
2017-04-25 17:27 ` [RFC 0/6] optimize ctx switch with rb-tree Liang, Kan
2017-04-25 17:49   ` David Carrillo-Cisneros
2017-04-25 18:11     ` Budankov, Alexey
2017-04-25 18:54       ` David Carrillo-Cisneros
2017-04-26 10:34         ` Budankov, Alexey
2017-04-26 19:40           ` David Carrillo-Cisneros
2017-04-26 10:52         ` Mark Rutland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170110102502.106187-2-davidcc@google.com \
    --to=davidcc@google.com \
    --cc=acme@kernel.org \
    --cc=ak@linux.intel.com \
    --cc=bp@suse.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=eranian@google.com \
    --cc=kan.liang@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=srinivas.pandruvada@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=vikas.shivappa@linux.intel.com \
    --cc=vince@deater.net \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox