[PATCH] perf-sched: add option to merge like comms to lat output

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] perf-sched: add option to merge like comms to lat output
@ 2015-05-21 21:25 Josef Bacik
  2015-05-22  6:27 ` Ingo Molnar
  0 siblings, 1 reply; 2+ messages in thread
From: Josef Bacik @ 2015-05-21 21:25 UTC (permalink / raw)
  To: a.p.zijlstra, mingo, acme, linux-kernel, kernel-team

Sometimes when debugging large multi-threaded applications it is helpful to
collate all of the latency numbers into one bulk record to get an idea of what
is going on.  This patch does this by merging any entries that belong to the
same comm into one entry and then spits out those totals.  I've also sligtly
changed the output so you can see how many threads were merged in the
processing.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 tools/perf/builtin-sched.c | 77 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5275bab..c74894f 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
 	u64			total_lat;
 	u64			nb_atoms;
 	u64			total_runtime;
+	int			num_merged;
 };
 
 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
 	u64		 all_runtime;
 	u64		 all_count;
 	u64		 cpu_last_switched[MAX_CPUS];
-	struct rb_root	 atom_root, sorted_atom_root;
+	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
 	struct list_head sort_list, cmp_pid;
 	bool force;
+	bool merge_by_comm;
 };
 
 static u64 get_nsecs(void)
@@ -1156,7 +1158,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
 	sched->all_runtime += work_list->total_runtime;
 	sched->all_count   += work_list->nb_atoms;
 
-	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
+	if (sched->merge_by_comm && work_list->num_merged > 1)
+		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
+	else
+		ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
 
 	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
@@ -1276,17 +1281,22 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
 static void perf_sched__sort_lat(struct perf_sched *sched)
 {
 	struct rb_node *node;
-
+	struct rb_root *root = &sched->atom_root;
+again:
 	for (;;) {
 		struct work_atoms *data;
-		node = rb_first(&sched->atom_root);
+		node = rb_first(root);
 		if (!node)
 			break;
 
-		rb_erase(node, &sched->atom_root);
+		rb_erase(node, root);
 		data = rb_entry(node, struct work_atoms, node);
 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
 	}
+	if (root == &sched->atom_root) {
+		root = &sched->merged_atom_root;
+		goto again;
+	}
 }
 
 static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1542,6 +1552,59 @@ static void print_bad_events(struct perf_sched *sched)
 	}
 }
 
+static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct work_atoms *this;
+	const char *comm = thread__comm_str(data->thread), *this_comm;
+
+	while (*new) {
+		int cmp;
+
+		this = container_of(*new, struct work_atoms, node);
+		parent = *new;
+
+		this_comm = thread__comm_str(this->thread);
+		cmp = strcmp(comm, this_comm);
+		if (cmp > 0) {
+			new = &((*new)->rb_left);
+		} else if (cmp < 0) {
+			new = &((*new)->rb_right);
+		} else {
+			this->num_merged++;
+			this->total_runtime += data->total_runtime;
+			this->nb_atoms += data->nb_atoms;
+			this->total_lat += data->total_lat;
+			list_splice(&data->work_list, &this->work_list);
+			if (this->max_lat < data->max_lat) {
+				this->max_lat = data->max_lat;
+				this->max_lat_at = data->max_lat_at;
+			}
+			zfree(&data);
+			return;
+		}
+	}
+
+	data->num_merged++;
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void perf_sched__merge_lat(struct perf_sched *sched)
+{
+	struct work_atoms *data;
+	struct rb_node *node;
+
+	if (!sched->merge_by_comm)
+		return;
+
+	while ((node = rb_first(&sched->atom_root))) {
+		rb_erase(node, &sched->atom_root);
+		data = rb_entry(node, struct work_atoms, node);
+		__merge_work_atoms(&sched->merged_atom_root, data);
+	}
+}
+
 static int perf_sched__lat(struct perf_sched *sched)
 {
 	struct rb_node *next;
@@ -1551,6 +1614,7 @@ static int perf_sched__lat(struct perf_sched *sched)
 	if (perf_sched__read_events(sched))
 		return -1;
 
+	perf_sched__merge_lat(sched);
 	perf_sched__sort_lat(sched);
 
 	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1702,6 +1766,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		.profile_cpu	      = -1,
 		.next_shortname1      = 'A',
 		.next_shortname2      = '0',
+		.merge_by_comm        = 0,
 	};
 	const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1712,6 +1777,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "CPU to profile on"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('m', "merge-by-comm", &sched.merge_by_comm,
+		    "merge like tasks together in the output"),
 	OPT_END()
 	};
 	const struct option replay_options[] = {
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] perf-sched: add option to merge like comms to lat output
  2015-05-21 21:25 [PATCH] perf-sched: add option to merge like comms to lat output Josef Bacik
@ 2015-05-22  6:27 ` Ingo Molnar
  0 siblings, 0 replies; 2+ messages in thread
From: Ingo Molnar @ 2015-05-22  6:27 UTC (permalink / raw)
  To: Josef Bacik; +Cc: a.p.zijlstra, mingo, acme, linux-kernel, kernel-team


* Josef Bacik <jbacik@fb.com> wrote:

> Sometimes when debugging large multi-threaded applications it is helpful to
> collate all of the latency numbers into one bulk record to get an idea of what
> is going on.  This patch does this by merging any entries that belong to the
> same comm into one entry and then spits out those totals.  I've also sligtly

s/sligtly
  slightly

> changed the output so you can see how many threads were merged in the
> processing.  Thanks,

Useful feature!

> +	OPT_BOOLEAN('m', "merge-by-comm", &sched.merge_by_comm,
> +		    "merge like tasks together in the output"),

Could you please make this the default behavior, and then add an 
option to print more detailed latencies?

We generally try to output general overviews first, and then make it 
easy to also output a more detailed view of information. That's how 
perf report, perf top and perf stat (-d, -dd, -ddd) works in general.

Also, please post a before/after sample output in the changelog.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-05-22  6:27 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-05-21 21:25 [PATCH] perf-sched: add option to merge like comms to lat output Josef Bacik
2015-05-22  6:27 ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox