public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v8 0/2] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
@ 2017-08-24 10:16 Alexey Budankov
  2017-08-24 10:18 ` [PATCH v8 1/2] perf/core: use rb trees for pinned/flexible groups Alexey Budankov
  2017-08-24 10:19 ` [PATCH v8 2/2] perf/core: add rotation switch to skip to the current CPU's events list on mux interrupt Alexey Budankov
  0 siblings, 2 replies; 3+ messages in thread
From: Alexey Budankov @ 2017-08-24 10:16 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: Andi Kleen, Kan Liang, Mark Rutland, David Carrillo-Cisneros,
	Stephane Eranian, Dmitri Prokhorov, Valery Cherepennikov,
	linux-kernel

Hi,

This patch set v8 addresses the feedback captured during the review of v7. 
For more background details please refer to v7 and earlier versions.

Specifically, in this version the mux switch is renamed to the rotation 
switch, and its value is one of those defined by the ROTATION_DISABLED 
and ROTATION_ENABLED macros.

The ctx->rotate_disable bool flag is renamed to ctx->rotation and also 
uses the ROTATION_DISABLED and ROTATION_ENABLED macros.

pinned_group_sched_in() and flexible_group_sched_in() functions are 
introduced to consolidate the code that enables a whole group from the 
pinned and flexible groups, respectively.

Patches in the set are expected to be applied one after another in 
the order given; they are logically split into two parts to simplify 
the review process.

The patch set was successfully tested overnight using the perf_fuzzer tool.

Thanks,
Alexey

---
 Alexey Budankov (2):
	perf/core: use rb trees for pinned/flexible groups
	perf/core: add rotation switch to skip to the current CPU's events list on mux interrupt
  
 include/linux/perf_event.h |  21 ++-
 kernel/events/core.c       | 456 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 362 insertions(+), 115 deletions(-)

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v8 1/2] perf/core: use rb trees for pinned/flexible groups
  2017-08-24 10:16 [PATCH v8 0/2] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
@ 2017-08-24 10:18 ` Alexey Budankov
  2017-08-24 10:19 ` [PATCH v8 2/2] perf/core: add rotation switch to skip to the current CPU's events list on mux interrupt Alexey Budankov
  1 sibling, 0 replies; 3+ messages in thread
From: Alexey Budankov @ 2017-08-24 10:18 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: Andi Kleen, Kan Liang, Mark Rutland, David Carrillo-Cisneros,
	Stephane Eranian, Dmitri Prokhorov, Valery Cherepennikov,
	linux-kernel

This patch moves event groups into an rb tree sorted by CPU, so that 
the multiplexing hrtimer interrupt handler can skip to the current 
CPU's list and ignore groups allocated for the other CPUs.

A new API for manipulating event groups in the trees is implemented, 
and the current implementation is adapted to use it.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
 include/linux/perf_event.h |  19 +++-
 kernel/events/core.c       | 220 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 202 insertions(+), 37 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b14095b..cc07904 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -572,7 +572,20 @@ struct perf_event {
 	 */
 	struct list_head		group_entry;
 	struct list_head		sibling_list;
-
+	/*
+	 * Node on the pinned or flexible tree located at the event context;
+	 * the node may be empty in case its event is not directly attached
+	 * to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct rb_node			group_node;
+	/*
+	 * List keeps groups allocated for the same cpu;
+	 * the list may be empty in case its event is not directly
+	 * attached to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct list_head		group_list;
 	/*
 	 * We need storage to track the entries in perf_pmu_migrate_context; we
 	 * cannot use the event_entry because of RCU and we want to keep the
@@ -741,8 +754,8 @@ struct perf_event_context {
 	struct mutex			mutex;
 
 	struct list_head		active_ctx_list;
-	struct list_head		pinned_groups;
-	struct list_head		flexible_groups;
+	struct rb_root			pinned_groups;
+	struct rb_root			flexible_groups;
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d704e23..aaf55ae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1466,8 +1466,12 @@ static enum event_type_t get_event_type(struct perf_event *event)
 	return event_type;
 }
 
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits;
+ */
+static struct rb_root *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
 	if (event->attr.pinned)
 		return &ctx->pinned_groups;
@@ -1476,6 +1480,143 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
+{
+	struct perf_event *node_event;
+	struct rb_node *parent;
+	struct rb_node **node;
+
+	node = &groups->rb_node;
+	parent = *node;
+
+	while (*node) {
+		parent = *node;
+		node_event = container_of(*node,
+				struct perf_event, group_node);
+
+		if (event->cpu < node_event->cpu) {
+			node = &parent->rb_left;
+		} else if (event->cpu > node_event->cpu) {
+			node = &parent->rb_right;
+		} else {
+			list_add_tail(&event->group_entry,
+					&node_event->group_list);
+			return;
+		}
+	}
+
+	list_add_tail(&event->group_entry, &event->group_list);
+
+	rb_link_node(&event->group_node, parent, node);
+	rb_insert_color(&event->group_node, groups);
+}
+
+/*
+ * Helper function to insert event into the pinned or
+ * flexible groups;
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct rb_root *groups;
+
+	groups = get_event_groups(event, ctx);
+	perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the tree
+ * it is replaced by the next group on the group's group_list.
+ */
+static void
+perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
+{
+	list_del_init(&event->group_entry);
+
+	if (!RB_EMPTY_NODE(&event->group_node)) {
+		if (!RB_EMPTY_ROOT(groups)) {
+			if (list_empty(&event->group_list)) {
+				rb_erase(&event->group_node, groups);
+			} else {
+				struct perf_event *next =
+					list_first_entry(&event->group_list,
+						struct perf_event, group_entry);
+				list_replace_init(&event->group_list,
+						&next->group_list);
+				rb_replace_node(&event->group_node,
+						&next->group_node, groups);
+			}
+		}
+		RB_CLEAR_NODE(&event->group_node);
+	}
+}
+
+/*
+ * Helper function to delete event from its groups;
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct rb_root *groups;
+
+	groups = get_event_groups(event, ctx);
+	perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Find group_list list by a cpu key.
+ */
+static struct list_head *
+perf_event_groups_get_list(struct rb_root *groups, int cpu)
+{
+	struct perf_event *node_event;
+	struct rb_node *node;
+
+	node = groups->rb_node;
+
+	while (node) {
+		node_event = container_of(node,
+				struct perf_event, group_node);
+
+		if (cpu < node_event->cpu) {
+			node = node->rb_left;
+		} else if (cpu > node_event->cpu) {
+			node = node->rb_right;
+		} else {
+			return &node_event->group_list;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find group list by a cpu key and rotate it.
+ */
+static void
+perf_event_groups_rotate(struct rb_root *groups, int cpu)
+{
+	struct list_head *group_list =
+			perf_event_groups_get_list(groups, cpu);
+
+	if (group_list)
+		list_rotate_left(group_list);
+}
+
+/*
+ * Iterate event groups thru the whole tree.
+ */
+#define perf_event_groups_for_each(event, iter, tree, node, list, link)	\
+	for (iter = rb_first(tree); iter; iter = rb_next(iter))		\
+		list_for_each_entry(event, &(rb_entry(iter,		\
+			typeof(*event), node)->list), link)
+
+/*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
@@ -1493,12 +1634,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * perf_group_detach can, at all times, locate all siblings.
 	 */
 	if (event->group_leader == event) {
-		struct list_head *list;
-
 		event->group_caps = event->event_caps;
-
-		list = ctx_group_list(event, ctx);
-		list_add_tail(&event->group_entry, list);
+		add_event_to_groups(event, ctx);
 	}
 
 	list_update_cgroup_event(event, ctx, true);
@@ -1689,7 +1826,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	list_del_rcu(&event->event_entry);
 
 	if (event->group_leader == event)
-		list_del_init(&event->group_entry);
+		del_event_from_groups(event, ctx);
 
 	update_group_times(event);
 
@@ -1709,7 +1846,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 static void perf_group_detach(struct perf_event *event)
 {
 	struct perf_event *sibling, *tmp;
-	struct list_head *list = NULL;
 
 	lockdep_assert_held(&event->ctx->lock);
 
@@ -1730,22 +1866,22 @@ static void perf_group_detach(struct perf_event *event)
 		goto out;
 	}
 
-	if (!list_empty(&event->group_entry))
-		list = &event->group_entry;
-
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
 	 * to whatever list we are on.
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
-		if (list)
-			list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
 
 		/* Inherit group flags from the previous leader */
 		sibling->group_caps = event->group_caps;
 
+		if (!list_empty(&event->group_entry)) {
+			list_del_init(&sibling->group_entry);
+			add_event_to_groups(sibling, event->ctx);
+		}
+
 		WARN_ON_ONCE(sibling->ctx != event->ctx);
 	}
 
@@ -2744,7 +2880,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 {
 	int is_active = ctx->is_active;
 	struct perf_event *event;
-
+	struct rb_node *node;
 	lockdep_assert_held(&ctx->lock);
 
 	if (likely(!ctx->nr_events)) {
@@ -2789,15 +2925,19 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		return;
 
 	perf_pmu_disable(ctx->pmu);
-	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+
+	if (is_active & EVENT_PINNED)
+		perf_event_groups_for_each(event, node,
+				&ctx->pinned_groups, group_node,
+				group_list, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-	}
 
-	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+	if (is_active & EVENT_FLEXIBLE)
+		perf_event_groups_for_each(event, node,
+				&ctx->flexible_groups, group_node,
+				group_list, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-	}
+
 	perf_pmu_enable(ctx->pmu);
 }
 
@@ -3095,8 +3235,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
+	struct rb_node *node;
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+	perf_event_groups_for_each(event, node, &ctx->pinned_groups,
+			group_node, group_list, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
 		if (!event_filter_match(event))
@@ -3125,9 +3267,12 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
+	struct rb_node *node;
 	int can_add_hw = 1;
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+	perf_event_groups_for_each(event, node, &ctx->flexible_groups,
+			group_node, group_list, group_entry) {
+
 		/* Ignore events in OFF or ERROR state */
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
@@ -3156,7 +3301,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct task_struct *task)
 {
 	int is_active = ctx->is_active;
-	u64 now;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3175,8 +3319,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	if (is_active & EVENT_TIME) {
 		/* start ctx time */
-		now = perf_clock();
-		ctx->timestamp = now;
+		ctx->timestamp = perf_clock();
 		perf_cgroup_set_timestamp(task, ctx);
 	}
 
@@ -3227,7 +3370,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 * However, if task's ctx is not carrying any pinned
 	 * events, no need to flip the cpuctx's events around.
 	 */
-	if (!list_empty(&ctx->pinned_groups))
+	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
@@ -3464,8 +3607,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
 	 */
-	if (!ctx->rotate_disable)
-		list_rotate_left(&ctx->flexible_groups);
+	if (!ctx->rotate_disable) {
+		int sw = -1, cpu = smp_processor_id();
+
+		perf_event_groups_rotate(&ctx->flexible_groups, sw);
+		perf_event_groups_rotate(&ctx->flexible_groups, cpu);
+	}
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3804,8 +3951,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->active_ctx_list);
-	INIT_LIST_HEAD(&ctx->pinned_groups);
-	INIT_LIST_HEAD(&ctx->flexible_groups);
+	ctx->pinned_groups = RB_ROOT;
+	ctx->flexible_groups = RB_ROOT;
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 }
@@ -9412,6 +9559,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->group_entry);
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
+	RB_CLEAR_NODE(&event->group_node);
+	INIT_LIST_HEAD(&event->group_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);
@@ -10859,7 +11008,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		 * First allocate and initialize a context for the
 		 * child.
 		 */
-		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+		child_ctx = alloc_perf_context(parent_ctx->pmu,	child);
 		if (!child_ctx)
 			return -ENOMEM;
 
@@ -10883,6 +11032,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
 	struct perf_event *event;
+	struct rb_node *node;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
 	unsigned long flags;
@@ -10916,7 +11066,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+	perf_event_groups_for_each(event, node,	&parent_ctx->pinned_groups,
+			group_node, group_list, group_entry) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)
@@ -10932,7 +11083,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	parent_ctx->rotate_disable = 1;
 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
-	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+	perf_event_groups_for_each(event, node,	&parent_ctx->flexible_groups,
+			group_node, group_list, group_entry) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH v8 2/2] perf/core: add rotation switch to skip to the current CPU's events list on mux interrupt
  2017-08-24 10:16 [PATCH v8 0/2] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
  2017-08-24 10:18 ` [PATCH v8 1/2] perf/core: use rb trees for pinned/flexible groups Alexey Budankov
@ 2017-08-24 10:19 ` Alexey Budankov
  1 sibling, 0 replies; 3+ messages in thread
From: Alexey Budankov @ 2017-08-24 10:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: Andi Kleen, Kan Liang, Mark Rutland, David Carrillo-Cisneros,
	Stephane Eranian, Dmitri Prokhorov, Valery Cherepennikov,
	linux-kernel

This patch implements a rotation switch that makes the multiplexing 
hrtimer interrupt handler skip to the current CPU's events list, and 
adapts the existing implementation to use the switch.

The value of the rotation switch is one of those defined by the 
ROTATION_DISABLED and ROTATION_ENABLED macros.

The ctx->rotate_disable bool flag is renamed to ctx->rotation 
and also uses the ROTATION_DISABLED and ROTATION_ENABLED macros.

The perf_event_groups_for_each_cpu() macro is introduced to iterate 
through a specific CPU's groups list, skipping groups allocated 
for the other CPUs.

pinned_group_sched_in() and flexible_group_sched_in() functions are 
introduced to consolidate the code that enables a whole group from 
the pinned and flexible groups, respectively.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
 include/linux/perf_event.h |   2 +-
 kernel/events/core.c       | 272 +++++++++++++++++++++++++++++----------------
 2 files changed, 178 insertions(+), 96 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index cc07904..b41ce8f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -762,7 +762,7 @@ struct perf_event_context {
 	int				is_active;
 	int				nr_stat;
 	int				nr_freq;
-	int				rotate_disable;
+	int				rotation;
 	atomic_t			refcount;
 	struct task_struct		*task;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aaf55ae..dcba003 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,6 +430,9 @@ static void update_perf_cpu_limits(void)
 	WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
+#define ROTATION_DISABLED 0
+#define ROTATION_ENABLED  1
+
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
@@ -556,11 +559,11 @@ void perf_sample_event_took(u64 sample_len_ns)
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			      enum event_type_t event_type);
+			      enum event_type_t event_type, int rotation);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 			     enum event_type_t event_type,
-			     struct task_struct *task);
+			     struct task_struct *task, int rotation);
 
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
@@ -717,7 +720,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
 		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL,
+					ROTATION_DISABLED);
 			/*
 			 * must not be done before ctxswout due
 			 * to event_filter_match() in event_sched_out()
@@ -736,7 +740,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			 */
 			cpuctx->cgrp = perf_cgroup_from_task(task,
 							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task,
+					ROTATION_DISABLED);
 		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1617,6 +1622,14 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
 			typeof(*event), node)->list), link)
 
 /*
+ * Iterate event groups related to specific cpu.
+ */
+#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link)	\
+	list = perf_event_groups_get_list(tree, cpu);			\
+	if (list)							\
+		list_for_each_entry(event, list, link)
+
+/*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
@@ -2397,12 +2410,12 @@ static void add_event_to_ctx(struct perf_event *event,
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type);
+			  enum event_type_t event_type, int rotation);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
-	     struct task_struct *task);
+	     struct task_struct *task, int rotation);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			       struct perf_event_context *ctx,
@@ -2414,19 +2427,19 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, cpuctx, event_type);
+	ctx_sched_out(ctx, cpuctx, event_type, ROTATION_DISABLED);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
-				struct task_struct *task)
+				struct task_struct *task, int rotation)
 {
-	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, rotation);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, rotation);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, rotation);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, rotation);
 }
 
 /*
@@ -2470,11 +2483,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	 *  - otherwise, do nothing more.
 	 */
 	if (cpu_event)
-		cpu_ctx_sched_out(cpuctx, ctx_event_type);
+		cpu_ctx_sched_out(cpuctx, ctx_event_type, ROTATION_DISABLED);
 	else if (ctx_event_type & EVENT_PINNED)
-		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, ROTATION_DISABLED);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx, current, ROTATION_DISABLED);
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }
 
@@ -2518,7 +2531,7 @@ static int  __perf_install_in_context(void *info)
 	}
 
 	if (reprogram) {
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx, get_event_type(event));
 	} else {
@@ -2661,7 +2674,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (ctx->is_active)
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);
 
 	__perf_event_mark_enabled(event);
 
@@ -2671,7 +2684,8 @@ static void __perf_event_enable(struct perf_event *event,
 	if (!event_filter_match(event)) {
 		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current,
+				ROTATION_DISABLED);
 		return;
 	}
 
@@ -2680,7 +2694,8 @@ static void __perf_event_enable(struct perf_event *event,
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current,
+				ROTATION_DISABLED);
 		return;
 	}
 
@@ -2876,11 +2891,14 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type)
+			  enum event_type_t event_type,
+			  int rotation)
 {
 	int is_active = ctx->is_active;
+	struct list_head *group_list;
 	struct perf_event *event;
 	struct rb_node *node;
+	int sw = -1, cpu = smp_processor_id();
 	lockdep_assert_held(&ctx->lock);
 
 	if (likely(!ctx->nr_events)) {
@@ -2926,17 +2944,41 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 
-	if (is_active & EVENT_PINNED)
-		perf_event_groups_for_each(event, node,
-				&ctx->pinned_groups, group_node,
-				group_list, group_entry)
-			group_sched_out(event, cpuctx, ctx);
+	if (is_active & EVENT_PINNED) {
+		if (rotation == ROTATION_ENABLED) {
+			perf_event_groups_for_each_cpu(event, cpu,
+					&ctx->pinned_groups,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+			perf_event_groups_for_each_cpu(event, sw,
+					&ctx->pinned_groups,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+		} else {
+			perf_event_groups_for_each(event, node,
+					&ctx->pinned_groups, group_node,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+		}
+	}
 
-	if (is_active & EVENT_FLEXIBLE)
-		perf_event_groups_for_each(event, node,
-				&ctx->flexible_groups, group_node,
-				group_list, group_entry)
-			group_sched_out(event, cpuctx, ctx);
+	if (is_active & EVENT_FLEXIBLE) {
+		if (rotation == ROTATION_ENABLED) {
+			perf_event_groups_for_each_cpu(event, cpu,
+					&ctx->flexible_groups,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+			perf_event_groups_for_each_cpu(event, sw,
+					&ctx->flexible_groups,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+		} else {
+			perf_event_groups_for_each(event, node,
+					&ctx->flexible_groups, group_node,
+					group_list, group_entry)
+				group_sched_out(event, cpuctx, ctx);
+		}
+	}
 
 	perf_pmu_enable(ctx->pmu);
 }
@@ -3225,72 +3267,110 @@ void __perf_event_task_sched_out(struct task_struct *task,
  * Called with IRQs disabled
  */
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			      enum event_type_t event_type)
+			      enum event_type_t event_type, int rotation)
+{
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, rotation);
+}
+
+static void
+pinned_group_sched_in(struct perf_event *event,
+		      struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx)
 {
-	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+	if (event->state <= PERF_EVENT_STATE_OFF)
+		return;
+	if (!event_filter_match(event))
+		return;
+
+	/* may need to reset tstamp_enabled */
+	if (is_cgroup_event(event))
+		perf_cgroup_mark_enabled(event, ctx);
+
+	if (group_can_go_on(event, cpuctx, 1))
+		group_sched_in(event, cpuctx, ctx);
+
+	/*
+	 * If this pinned group hasn't been scheduled,
+	 * put it in error state.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_ERROR;
+	}
 }
 
 static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
+		    struct perf_cpu_context *cpuctx, int rotation)
 {
+	struct list_head *group_list;
 	struct perf_event *event;
 	struct rb_node *node;
+	int sw = -1, cpu = smp_processor_id();
 
-	perf_event_groups_for_each(event, node, &ctx->pinned_groups,
-			group_node, group_list, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		if (!event_filter_match(event))
-			continue;
+	if (rotation == ROTATION_ENABLED) {
+		perf_event_groups_for_each_cpu(event, sw,
+			&ctx->pinned_groups, group_list, group_entry)
+			pinned_group_sched_in(event, ctx, cpuctx);
 
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
+		perf_event_groups_for_each_cpu(event, cpu,
+			&ctx->pinned_groups, group_list, group_entry)
+			pinned_group_sched_in(event, ctx, cpuctx);
+	} else {
+		perf_event_groups_for_each(event, node, &ctx->pinned_groups,
+			group_node, group_list, group_entry)
+			pinned_group_sched_in(event, ctx, cpuctx);
+	}
+}
 
-		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx);
+static void
+flexible_group_sched_in(struct perf_event *event,
+			struct perf_event_context *ctx,
+			struct perf_cpu_context *cpuctx,
+			int *can_add_hw)
+{
+	/* Ignore events in OFF or ERROR state */
+	if (event->state <= PERF_EVENT_STATE_OFF)
+		return;
+	/*
+	 * Listen to the 'cpu' scheduling filter constraint
+	 * of events:
+	 */
+	if (!event_filter_match(event))
+		return;
 
-		/*
-		 * If this pinned group hasn't been scheduled,
-		 * put it in error state.
-		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE) {
-			update_group_times(event);
-			event->state = PERF_EVENT_STATE_ERROR;
-		}
+	/* may need to reset tstamp_enabled */
+	if (is_cgroup_event(event))
+		perf_cgroup_mark_enabled(event, ctx);
+
+	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+		if (group_sched_in(event, cpuctx, ctx))
+			*can_add_hw = 0;
 	}
 }
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
+		      struct perf_cpu_context *cpuctx, int rotation)
 {
+	struct list_head *group_list;
 	struct perf_event *event;
 	struct rb_node *node;
 	int can_add_hw = 1;
-
-	perf_event_groups_for_each(event, node, &ctx->flexible_groups,
-			group_node, group_list, group_entry) {
-
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		/*
-		 * Listen to the 'cpu' scheduling filter constraint
-		 * of events:
-		 */
-		if (!event_filter_match(event))
-			continue;
-
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
-		if (group_can_go_on(event, cpuctx, can_add_hw)) {
-			if (group_sched_in(event, cpuctx, ctx))
-				can_add_hw = 0;
-		}
+	int sw = -1, cpu = smp_processor_id();
+
+	if (rotation == ROTATION_ENABLED) {
+		perf_event_groups_for_each_cpu(event, sw,
+			&ctx->flexible_groups, group_list, group_entry)
+			flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+		can_add_hw = 1;
+		perf_event_groups_for_each_cpu(event, cpu,
+			&ctx->flexible_groups, group_list, group_entry)
+			flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+	} else {
+		perf_event_groups_for_each(event, node, &ctx->flexible_groups,
+			group_node, group_list, group_entry)
+			flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
 	}
 }
 
@@ -3298,7 +3378,7 @@ static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
-	     struct task_struct *task)
+	     struct task_struct *task, int rotation)
 {
 	int is_active = ctx->is_active;
 
@@ -3328,20 +3408,20 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_pinned_sched_in(ctx, cpuctx, rotation);
 
 	/* Then walk through the lower prio flexible groups */
 	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		ctx_flexible_sched_in(ctx, cpuctx, rotation);
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 			     enum event_type_t event_type,
-			     struct task_struct *task)
+			     struct task_struct *task, int rotation)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type, task);
+	ctx_sched_in(ctx, cpuctx, event_type, task, rotation);
 }
 
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3371,8 +3451,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 * events, no need to flip the cpuctx's events around.
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
-		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	perf_event_sched_in(cpuctx, ctx, task);
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, ROTATION_DISABLED);
+	perf_event_sched_in(cpuctx, ctx, task, ROTATION_DISABLED);
 	perf_pmu_enable(ctx->pmu);
 
 unlock:
@@ -3607,7 +3687,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
 	 */
-	if (!ctx->rotate_disable) {
+	if (ctx->rotation == ROTATION_ENABLED) {
 		int sw = -1, cpu = smp_processor_id();
 
 		perf_event_groups_rotate(&ctx->flexible_groups, sw);
@@ -3618,40 +3698,40 @@ static void rotate_ctx(struct perf_event_context *ctx)
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = NULL;
-	int rotate = 0;
+	int rotation = ROTATION_DISABLED;
 
 	if (cpuctx->ctx.nr_events) {
 		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-			rotate = 1;
+			rotation = ROTATION_ENABLED;
 	}
 
 	ctx = cpuctx->task_ctx;
 	if (ctx && ctx->nr_events) {
 		if (ctx->nr_events != ctx->nr_active)
-			rotate = 1;
+			rotation = ROTATION_ENABLED;
 	}
 
-	if (!rotate)
+	if (rotation == ROTATION_DISABLED)
 		goto done;
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_pmu_disable(cpuctx->ctx.pmu);
 
-	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, rotation);
 	if (ctx)
-		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, rotation);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	perf_event_sched_in(cpuctx, ctx, current);
+	perf_event_sched_in(cpuctx, ctx, current, rotation);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 done:
 
-	return rotate;
+	return rotation;
 }
 
 void perf_event_task_tick(void)
@@ -3705,7 +3785,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
 		event_type |= get_event_type(event);
@@ -3718,7 +3798,8 @@ static void perf_event_enable_on_exec(int ctxn)
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, event_type);
 	} else {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current,
+				ROTATION_DISABLED);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
@@ -3955,6 +4036,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	ctx->flexible_groups = RB_ROOT;
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
+	ctx->rotation = ROTATION_ENABLED;
 }
 
 static struct perf_event_context *
@@ -11080,7 +11162,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 * rotate_ctx() will change the list from interrupt context.
 	 */
 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
-	parent_ctx->rotate_disable = 1;
+	parent_ctx->rotation = ROTATION_DISABLED;
 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
 	perf_event_groups_for_each(event, node,	&parent_ctx->flexible_groups,
@@ -11092,7 +11174,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	}
 
 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
-	parent_ctx->rotate_disable = 0;
+	parent_ctx->rotation = ROTATION_ENABLED;
 
 	child_ctx = child->perf_event_ctxp[ctxn];
 
