All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Carrillo-Cisneros <davidcc@google.com>
To: linux-kernel@vger.kernel.org
Cc: "x86@kernel.org" <x86@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andi Kleen <ak@linux.intel.com>, Kan Liang <kan.liang@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Borislav Petkov <bp@suse.de>,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Vikas Shivappa <vikas.shivappa@linux.intel.com>,
	Mark Rutland <mark.rutland@arm.com>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Vince Weaver <vince@deater.net>, Paul Turner <pjt@google.com>,
	Stephane Eranian <eranian@google.com>,
	David Carrillo-Cisneros <davidcc@google.com>
Subject: [RFC 1/6] perf/core: create active and inactive event groups
Date: Tue, 10 Jan 2017 02:24:57 -0800	[thread overview]
Message-ID: <20170110102502.106187-2-davidcc@google.com> (raw)
In-Reply-To: <20170110102502.106187-1-davidcc@google.com>

Currently, perf uses pinned_groups and flexible_groups for sched in/out.
We can do better because:
  - sched out only cares about the ACTIVE events, which are usually a
  small set of events.
  - There can be many events in these lists that are not relevant to
  the scheduler (e.g. other CPU/cgroups, events in OFF and ERROR state).

Reduce the set of events to iterate over each context switch by adding
three new lists: active_pinned_groups, active_flexible_groups and
inactive_groups. All events in each list are in the same state so we
avoid checking state. It also saves the iteration over events in OFF and
ERROR state during sched in/out.

The main impact of this patch is that ctx_sched_out can use the "small"
active_{pinned,flexible}_groups instead of the potentially much larger
{pinned,flexible}_groups.

There are no separate pinned/flexible versions of the inactive list
because subsequent patches will create an index on it.

Bookkeeping of the new lists is more involved, but it can provide a
potentially large speed up. The new lists are intended to eventually
replace {pinned,flexible}_groups, although that's not yet implemented.

The inactive list is kept in FIFO order and only added after
group_sched_in has succeeded. This guarantees that it's in timestamp
order.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
---
 include/linux/perf_event.h |  6 +++
 kernel/events/core.c       | 93 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..3fa18f05c9b0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -573,6 +573,7 @@ struct perf_event {
 
 	struct hlist_node		hlist_entry;
 	struct list_head		active_entry;
+	struct list_head		ctx_active_entry;
 	int				nr_siblings;
 
 	/* Not serialized. Only written during event initialization. */
@@ -734,6 +735,11 @@ struct perf_event_context {
 	struct list_head		active_ctx_list;
 	struct list_head		pinned_groups;
 	struct list_head		flexible_groups;
+
+	struct list_head		active_pinned_groups;
+	struct list_head		active_flexible_groups;
+	struct list_head		inactive_groups;
+
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index faf073d0287f..b744b5a8dbd0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1462,6 +1462,21 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 		return &ctx->flexible_groups;
 }
 
+static void
+ctx_sched_groups_to_inactive(struct perf_event *event,
+			     struct perf_event_context *ctx)
+{
+	WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
+	list_move_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+};
+
+static void
+ctx_sched_groups_add(struct perf_event *event, struct perf_event_context *ctx)
+{
+	WARN_ON(!list_empty(&event->ctx_active_entry));
+	list_add_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1487,10 +1502,11 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 		list = ctx_group_list(event, ctx);
 		list_add_tail(&event->group_entry, list);
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_add(event, ctx);
 	}
 
 	list_update_cgroup_event(event, ctx, true);
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1648,6 +1664,13 @@ static void perf_group_attach(struct perf_event *event)
 		perf_event__header_size(pos);
 }
 
+static void ctx_sched_groups_del(struct perf_event *group,
+				 struct perf_event_context *ctx)
+{
+	WARN_ON(group->state != PERF_EVENT_STATE_INACTIVE);
+	list_del_init(&group->ctx_active_entry);
+}
+
 /*
  * Remove a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1674,8 +1697,11 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_del_rcu(&event->event_entry);
 
-	if (event->group_leader == event)
+	if (event->group_leader == event) {
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_del(event, ctx);
 		list_del_init(&event->group_entry);
+	}
 
 	update_group_times(event);
 
@@ -1851,6 +1877,11 @@ group_sched_out(struct perf_event *group_event,
 
 	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
+
+	if (group_event->state <= PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_to_inactive(group_event, ctx);
+	if (group_event->state < PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(group_event, ctx);
 }
 
 #define DETACH_GROUP	0x01UL
@@ -1918,6 +1949,8 @@ static void __perf_event_disable(struct perf_event *event,
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(event, ctx);
 	event->state = PERF_EVENT_STATE_OFF;
 }
 
@@ -2014,6 +2047,17 @@ static void perf_set_shadow_time(struct perf_event *event,
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static void
+ctx_sched_groups_to_active(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct list_head *h = event->attr.pinned ? &ctx->active_pinned_groups :
+						   &ctx->active_flexible_groups;
+	WARN_ON(!event);
+	WARN_ON(list_empty(&event->ctx_active_entry));
+	WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
+	list_move_tail(&event->ctx_active_entry, h);
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -2091,9 +2135,7 @@ group_sched_in(struct perf_event *group_event,
 	u64 now = ctx->time;
 	bool simulate = false;
 
-	if (group_event->state == PERF_EVENT_STATE_OFF)
-		return 0;
-
+	WARN_ON(group_event->state != PERF_EVENT_STATE_INACTIVE);
 	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
@@ -2112,9 +2154,10 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		ctx_sched_groups_to_active(group_event, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
@@ -2396,6 +2439,7 @@ static void __perf_event_enable(struct perf_event *event,
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	if (!ctx->is_active)
 		return;
@@ -2611,7 +2655,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			  enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
-	struct perf_event *event;
+	struct perf_event *event, *tmp;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -2658,13 +2702,17 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_pinned_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_flexible_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 	perf_pmu_enable(ctx->pmu);
 }
@@ -2962,10 +3010,11 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		if (!event_filter_match(event))
 			continue;
@@ -2983,6 +3032,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		 */
 		if (event->state == PERF_EVENT_STATE_INACTIVE) {
 			update_group_times(event);
+			ctx_sched_groups_del(event, ctx);
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
@@ -2992,12 +3042,12 @@ static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 	int can_add_hw = 1;
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
@@ -3389,6 +3439,7 @@ static int event_enable_on_exec(struct perf_event *event,
 		return 0;
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	return 1;
 }
@@ -3639,6 +3690,9 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	INIT_LIST_HEAD(&ctx->pinned_groups);
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
+	INIT_LIST_HEAD(&ctx->active_pinned_groups);
+	INIT_LIST_HEAD(&ctx->active_flexible_groups);
+	INIT_LIST_HEAD(&ctx->inactive_groups);
 	atomic_set(&ctx->refcount, 1);
 }
 
@@ -9109,6 +9163,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->ctx_active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
@@ -10085,6 +10140,10 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
+
+	if (!parent_event && child_event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(parent_event, child_ctx);
+
 	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
-- 
2.11.0.390.gc69c2f50cf-goog

  reply	other threads:[~2017-01-10 10:25 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-10 10:24 [RFC 0/6] optimize ctx switch with rb-tree David Carrillo-Cisneros
2017-01-10 10:24 ` David Carrillo-Cisneros [this message]
2017-01-10 13:49   ` [RFC 1/6] perf/core: create active and inactive event groups Mark Rutland
2017-01-10 20:45     ` David Carrillo-Cisneros
2017-01-12 11:05       ` Mark Rutland
     [not found]         ` <CALcN6mhPmpSqKhE3Ua+j-xROLzeAyrgdCk4AGGtfF9kExXRTJg@mail.gmail.com>
2017-01-13 11:01           ` Mark Rutland
2017-01-10 10:24 ` [RFC 2/6] perf/core: add a rb-tree index to inactive_groups David Carrillo-Cisneros
2017-01-10 14:14   ` Mark Rutland
2017-01-10 20:20     ` David Carrillo-Cisneros
2017-01-12 11:47       ` Mark Rutland
2017-01-13  7:34         ` David Carrillo-Cisneros
2017-01-16  2:03   ` [lkp-developer] [perf/core] 33da94bd89: BUG:unable_to_handle_kernel kernel test robot
2017-01-16  2:03     ` kernel test robot
2017-01-10 10:24 ` [RFC 3/6] perf/core: use rb-tree to sched in event groups David Carrillo-Cisneros
2017-01-10 16:38   ` Mark Rutland
2017-01-10 20:51     ` David Carrillo-Cisneros
2017-01-12 12:14       ` Mark Rutland
2017-01-13  8:01         ` David Carrillo-Cisneros
2017-01-13 10:24           ` Mark Rutland
2017-01-11 20:31     ` Liang, Kan
2017-01-12 10:11       ` Mark Rutland
2017-01-12 13:28         ` Liang, Kan
2017-01-13  8:05           ` David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 4/6] perf/core: avoid rb-tree traversal when no inactive events David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 5/6] perf/core: rotation no longer necessary. Behavior has changed. Beware David Carrillo-Cisneros
2017-01-10 10:25 ` [RFC 6/6] perf/core: use rb-tree index to optimize filtered perf_iterate_ctx David Carrillo-Cisneros
2017-01-16  2:05   ` [lkp-developer] [perf/core] 49c04ee1a7: WARNING:at_kernel/events/core.c:#perf_iterate_ctx_matching kernel test robot
2017-01-16  2:05     ` kernel test robot
2017-04-25 17:27 ` [RFC 0/6] optimize ctx switch with rb-tree Liang, Kan
2017-04-25 17:49   ` David Carrillo-Cisneros
2017-04-25 18:11     ` Budankov, Alexey
2017-04-25 18:54       ` David Carrillo-Cisneros
2017-04-26 10:34         ` Budankov, Alexey
2017-04-26 19:40           ` David Carrillo-Cisneros
2017-04-26 10:52         ` Mark Rutland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170110102502.106187-2-davidcc@google.com \
    --to=davidcc@google.com \
    --cc=acme@kernel.org \
    --cc=ak@linux.intel.com \
    --cc=bp@suse.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=eranian@google.com \
    --cc=kan.liang@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=srinivas.pandruvada@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=vikas.shivappa@linux.intel.com \
    --cc=vince@deater.net \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.