public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: kan.liang@linux.intel.com
To: peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com,
	linux-kernel@vger.kernel.org
Cc: eranian@google.com, tj@kernel.org, ak@linux.intel.com,
	Kan Liang <kan.liang@linux.intel.com>
Subject: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
Date: Mon, 29 Apr 2019 07:44:04 -0700	[thread overview]
Message-ID: <1556549045-71814-4-git-send-email-kan.liang@linux.intel.com> (raw)
In-Reply-To: <1556549045-71814-1-git-send-email-kan.liang@linux.intel.com>

From: Kan Liang <kan.liang@linux.intel.com>

The current RB tree for pinned/flexible groups doesn't take cgroups
into account. All events on a given CPU are fed to
pinned/flexible_sched_in(), which relies on perf_cgroup_match() to
filter the events for a specific cgroup. This method has high overhead,
especially with frequent context switches when several events and
cgroups are involved.

Add a unique cgrp_id for each cgroup, which is composed of the CPU ID
and the css subsys-unique ID. The low 32 bits of cgrp_id (the css
subsys-unique ID) are used as part of the complex key of the RB tree.
Events in the same cgroup have the same cgrp_id.
The cgrp_id is always zero for the non-cgroup case. There is no
functional change for the non-cgroup case.

Add perf_event_groups_first_cgroup() and
perf_event_groups_next_cgroup(), which will be used later to traverse
the events for a specific cgroup on a given CPU.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |  6 ++++
 kernel/events/core.c       | 84 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 039e2f2..7eff286 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -703,6 +703,7 @@ struct perf_event {
 
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
+	u64				cgrp_id; /* perf cgroup ID */
 #endif
 
 	struct list_head		sb_list;
@@ -825,6 +826,9 @@ struct bpf_perf_event_data_kern {
 
 #ifdef CONFIG_CGROUP_PERF
 
+#define PERF_CGROUP_ID_MASK		0xffffffff	/* css subsys-unique ID bits */
+#define cgrp_id_low(id)			((id) & PERF_CGROUP_ID_MASK)
+
 /*
  * perf_cgroup_info keeps track of time_enabled for a cgroup.
  * This is a per-cpu dynamically allocated data structure.
@@ -837,6 +841,8 @@ struct perf_cgroup_info {
 struct perf_cgroup {
 	struct cgroup_subsys_state	css;
 	struct perf_cgroup_info	__percpu *info;
+	/* perf cgroup ID = (CPU ID << 32) | css subsys-unique ID */
+	u64 __percpu			*cgrp_id;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 782fd86..5ecc048 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -901,6 +901,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	struct cgroup_subsys_state *css;
 	struct fd f = fdget(fd);
 	int ret = 0;
+	u64 cgrp_id;
 
 	if (!f.file)
 		return -EBADF;
@@ -915,6 +916,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
 
+	cgrp_id = ((u64)smp_processor_id() << 32) | css->id;
+	event->cgrp_id = cgrp_id;
+	*per_cpu_ptr(cgrp->cgrp_id, event->cpu) = cgrp_id;
+
 	/*
 	 * all events in a group must monitor
 	 * the same cgroup because a task belongs
@@ -1494,6 +1499,9 @@ static void init_event_group(struct perf_event *event)
 {
 	RB_CLEAR_NODE(&event->group_node);
 	event->group_index = 0;
+#ifdef CONFIG_CGROUP_PERF
+	event->cgrp_id = 0;
+#endif
 }
 
 /*
@@ -1521,8 +1529,8 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
 /*
  * Compare function for event groups;
  *
- * Implements complex key that first sorts by CPU and then by virtual index
- * which provides ordering when rotating groups for the same CPU.
+ * Implements complex key that first sorts by CPU and cgroup ID, then by
+ * virtual index which provides ordering when rotating groups for the same CPU.
  */
 static bool
 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
@@ -1532,6 +1540,13 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 	if (left->cpu > right->cpu)
 		return false;
 
+#ifdef CONFIG_CGROUP_PERF
+	if (cgrp_id_low(left->cgrp_id) < cgrp_id_low(right->cgrp_id))
+		return true;
+	if (cgrp_id_low(left->cgrp_id) > cgrp_id_low(right->cgrp_id))
+		return false;
+#endif
+
 	if (left->group_index < right->group_index)
 		return true;
 	if (left->group_index > right->group_index)
@@ -1541,7 +1556,8 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 }
 
 /*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
+ * Insert @event into @groups' tree, using
+ * {@event->cpu, @event->cgrp_id, ++@groups->index} for
  * key (see perf_event_groups_less). This places it last inside the CPU
  * subtree.
  */
@@ -1650,6 +1666,50 @@ perf_event_groups_next(struct perf_event *event)
 	return NULL;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+
+/* First event in @groups on @cpu whose cgroup key matches @cgrp_id, or NULL. */
+static struct perf_event *
+perf_event_groups_first_cgroup(struct perf_event_groups *groups,
+			       int cpu, u64 cgrp_id)
+{
+	struct rb_node *node = groups->tree.rb_node;
+	struct perf_event *cur, *first = NULL;
+	u64 key = cgrp_id_low(cgrp_id);	/* tree sorts on the low 32 bits */
+
+	while (node) {
+		cur = container_of(node, struct perf_event, group_node);
+		if (cpu < cur->cpu) {
+			node = node->rb_left;
+		} else if (cpu > cur->cpu) {
+			node = node->rb_right;
+		} else if (key < cgrp_id_low(cur->cgrp_id)) {
+			node = node->rb_left;
+		} else if (key > cgrp_id_low(cur->cgrp_id)) {
+			node = node->rb_right;
+		} else {
+			first = cur;	/* hit; keep descending left for the first */
+			node = node->rb_left;
+		}
+	}
+
+	return first;
+}
+
+/* Next event with @event's CPU and cgroup key (cgrp_id low 32 bits), or NULL. */
+static struct perf_event *
+perf_event_groups_next_cgroup(struct perf_event *event)
+{
+	struct perf_event *next;
+
+	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
+	if (!next || next->cpu != event->cpu)
+		return NULL;
+	return cgrp_id_low(next->cgrp_id) == cgrp_id_low(event->cgrp_id) ? next : NULL;
+}
+
+#endif
+
 /*
  * Iterate through the whole groups tree.
  */
@@ -12127,18 +12187,28 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	jc->info = alloc_percpu(struct perf_cgroup_info);
-	if (!jc->info) {
-		kfree(jc);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!jc->info)
+		goto free_jc;
+
+	jc->cgrp_id = alloc_percpu(u64);
+	if (!jc->cgrp_id)
+		goto free_jc_info;
 
 	return &jc->css;
+
+free_jc_info:
+	free_percpu(jc->info);
+free_jc:
+	kfree(jc);
+
+	return ERR_PTR(-ENOMEM);
 }
 
 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
 
+	free_percpu(jc->cgrp_id);	/* per-CPU IDs from perf_cgroup_css_alloc() */
 	free_percpu(jc->info);
 	kfree(jc);
 }
-- 
2.7.4


  parent reply	other threads:[~2019-04-29 14:45 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-29 14:44 [PATCH 0/4] Optimize cgroup context switch kan.liang
2019-04-29 14:44 ` [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring kan.liang
2019-04-29 15:04   ` Mark Rutland
2019-04-29 15:27     ` Liang, Kan
2019-04-30  8:56   ` Peter Zijlstra
2019-04-30 15:45     ` Liang, Kan
2019-04-29 14:44 ` [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in() kan.liang
2019-04-29 15:12   ` Mark Rutland
2019-04-29 15:31     ` Liang, Kan
2019-04-29 16:56       ` Mark Rutland
2019-04-29 14:44 ` kan.liang [this message]
2019-04-29 23:02   ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree Ian Rogers
2019-04-30  9:08     ` Peter Zijlstra
2019-04-30 15:46       ` Liang, Kan
2019-04-30  9:03   ` Peter Zijlstra
2019-04-30 15:46     ` Liang, Kan
2019-04-30  9:03   ` Peter Zijlstra
2019-04-29 14:44 ` [PATCH 4/4] perf cgroup: Add fast path for cgroup switch kan.liang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1556549045-71814-4-git-send-email-kan.liang@linux.intel.com \
    --to=kan.liang@linux.intel.com \
    --cc=ak@linux.intel.com \
    --cc=eranian@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox