[PATCH] perf tooling: Simplify 'perf bench syscall'

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ingo Molnar <mingo@kernel.org>
To: riel@redhat.com
Cc: linux-kernel@vger.kernel.org, fweisbec@gmail.com,
	tglx@linutronix.de, luto@amacapital.net, peterz@infradead.org,
	clark@redhat.com, Arnaldo Carvalho de Melo <acme@infradead.org>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'
Date: Mon, 1 Feb 2016 08:48:49 +0100	[thread overview]
Message-ID: <20160201074849.GA9129@gmail.com> (raw)
In-Reply-To: <20160201074156.GA27156@gmail.com>


* Ingo Molnar <mingo@kernel.org> wrote:

> [...]
> 
> I kept the process, threading and memory allocation bits of numa.c, just in case 
> we need them to measure more complex syscalls. Maybe we could keep the threading 
> bits and remove the memory allocation parameters, to simplify the benchmark?

So the patch below removes the NUMA details: the convergence measurement and the 
memory access pattern options. This reduces the line count by about 30%. I suspect 
it should be combined with the previous patch.

Thanks,

	Ingo

==================>
>From a992aecebe12a195ffa74e09fcbe6b48db4430e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 1 Feb 2016 08:46:39 +0100
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'

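Remove NUMA legacies inherited from the NUMA benchmark: the convergence
measurement, the perturbation logic and the data access pattern options.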

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/perf/bench/syscall.c | 316 +--------------------------------------------
 1 file changed, 5 insertions(+), 311 deletions(-)

diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
index 5a4ef02176d1..fabac462bde5 100644
--- a/tools/perf/bench/syscall.c
+++ b/tools/perf/bench/syscall.c
@@ -81,11 +81,6 @@ struct params {
 	double			mb_thread;
 
 	/* Access patterns to the working set: */
-	bool			data_reads;
-	bool			data_writes;
-	bool			data_backwards;
-	bool			data_zero_memset;
-	bool			data_rand_walk;
 	u32			nr_loops;
 	u32			nr_secs;
 	u32			sleep_usecs;
@@ -108,10 +103,6 @@ struct params {
 	int			nr_tasks;
 	bool			show_quiet;
 
-	bool			show_convergence;
-	bool			measure_convergence;
-
-	int			perturb_secs;
 	int			nr_cpus;
 	int			nr_nodes;
 
@@ -139,8 +130,6 @@ struct global_info {
 
 	struct thread_data	*threads;
 
-	/* Convergence latency measurement: */
-	bool			all_converged;
 	bool			stop_work;
 
 	int			print_once;
@@ -168,23 +157,13 @@ static const struct option options[] = {
 	OPT_UINTEGER('s', "nr_secs"	, &p0.nr_secs,		"max number of seconds to run (default: 5 secs)"),
 	OPT_UINTEGER('u', "usleep"	, &p0.sleep_usecs,	"usecs to sleep per loop iteration"),
 
-	OPT_BOOLEAN('R', "data_reads"	, &p0.data_reads,	"access the data via writes (can be mixed with -W)"),
-	OPT_BOOLEAN('W', "data_writes"	, &p0.data_writes,	"access the data via writes (can be mixed with -R)"),
-	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards,	"access the data backwards as well"),
-	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
-	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk,	"access the data with random (32bit LFSR) walk"),
-
-
 	OPT_BOOLEAN('z', "init_zero"	, &p0.init_zero,	"bzero the initial allocations"),
 	OPT_BOOLEAN('I', "init_random"	, &p0.init_random,	"randomize the contents of the initial allocations"),
 	OPT_BOOLEAN('0', "init_cpu0"	, &p0.init_cpu0,	"do the initial allocations on CPU#0"),
-	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs,	"perturb thread 0/0 every X secs, to test convergence stability"),
 
 	OPT_INCR   ('d', "show_details"	, &p0.show_details,	"Show details"),
 	OPT_INCR   ('a', "all"		, &p0.run_all,		"Run all tests in the suite"),
 	OPT_INTEGER('H', "thp"		, &p0.thp,		"MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
-	OPT_BOOLEAN('m', "measure_convergence",	&p0.measure_convergence, "measure convergence latency"),
 	OPT_BOOLEAN('q', "quiet"	, &p0.show_quiet,	"quiet mode"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
 
@@ -208,32 +187,6 @@ static const char * const syscall_usage[] = {
 	NULL
 };
 
-static cpu_set_t bind_to_cpu(int target_cpu)
-{
-	cpu_set_t orig_mask, mask;
-	int ret;
-
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
-
-	CPU_ZERO(&mask);
-
-	if (target_cpu == -1) {
-		int cpu;
-
-		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
-	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
-	}
-
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
-
-	return orig_mask;
-}
-
 static cpu_set_t bind_to_node(int target_node)
 {
 	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
@@ -699,222 +652,11 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
 	prctl(0, bytes_worked);
 }
 
-#define MAX_NR_NODES	64
-
-/*
- * Count the number of nodes a process's threads
- * are spread out on.
- *
- * A count of 1 means that the process is compressed
- * to a single node. A count of g->p.nr_nodes means it's
- * spread out on the whole system.
- */
-static int count_process_nodes(int process_nr)
-{
-	char node_present[MAX_NR_NODES] = { 0, };
-	int nodes;
-	int n, t;
-
-	for (t = 0; t < g->p.nr_threads; t++) {
-		struct thread_data *td;
-		int task_nr;
-		int node;
-
-		task_nr = process_nr*g->p.nr_threads + t;
-		td = g->threads + task_nr;
-
-		node = numa_node_of_cpu(td->curr_cpu);
-		if (node < 0) /* curr_cpu was likely still -1 */
-			return 0;
-
-		node_present[node] = 1;
-	}
-
-	nodes = 0;
-
-	for (n = 0; n < MAX_NR_NODES; n++)
-		nodes += node_present[n];
-
-	return nodes;
-}
-
-/*
- * Count the number of distinct process-threads a node contains.
- *
- * A count of 1 means that the node contains only a single
- * process. If all nodes on the system contain at most one
- * process then we are well-converged.
- */
-static int count_node_processes(int node)
-{
-	int processes = 0;
-	int t, p;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		for (t = 0; t < g->p.nr_threads; t++) {
-			struct thread_data *td;
-			int task_nr;
-			int n;
-
-			task_nr = p*g->p.nr_threads + t;
-			td = g->threads + task_nr;
-
-			n = numa_node_of_cpu(td->curr_cpu);
-			if (n == node) {
-				processes++;
-				break;
-			}
-		}
-	}
-
-	return processes;
-}
-
-static void calc_convergence_compression(int *strong)
-{
-	unsigned int nodes_min, nodes_max;
-	int p;
-
-	nodes_min = -1;
-	nodes_max =  0;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		unsigned int nodes = count_process_nodes(p);
-
-		if (!nodes) {
-			*strong = 0;
-			return;
-		}
-
-		nodes_min = min(nodes, nodes_min);
-		nodes_max = max(nodes, nodes_max);
-	}
-
-	/* Strong convergence: all threads compress on a single node: */
-	if (nodes_min == 1 && nodes_max == 1) {
-		*strong = 1;
-	} else {
-		*strong = 0;
-		tprintf(" {%d-%d}", nodes_min, nodes_max);
-	}
-}
-
-static void calc_convergence(double runtime_ns_max, double *convergence)
-{
-	unsigned int loops_done_min, loops_done_max;
-	int process_groups;
-	int nodes[MAX_NR_NODES];
-	int distance;
-	int nr_min;
-	int nr_max;
-	int strong;
-	int sum;
-	int nr;
-	int node;
-	int cpu;
-	int t;
-
-	if (!g->p.show_convergence && !g->p.measure_convergence)
-		return;
-
-	for (node = 0; node < g->p.nr_nodes; node++)
-		nodes[node] = 0;
-
-	loops_done_min = -1;
-	loops_done_max = 0;
-
-	for (t = 0; t < g->p.nr_tasks; t++) {
-		struct thread_data *td = g->threads + t;
-		unsigned int loops_done;
-
-		cpu = td->curr_cpu;
-
-		/* Not all threads have written it yet: */
-		if (cpu < 0)
-			continue;
-
-		node = numa_node_of_cpu(cpu);
-
-		nodes[node]++;
-
-		loops_done = td->loops_done;
-		loops_done_min = min(loops_done, loops_done_min);
-		loops_done_max = max(loops_done, loops_done_max);
-	}
-
-	nr_max = 0;
-	nr_min = g->p.nr_tasks;
-	sum = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		nr = nodes[node];
-		nr_min = min(nr, nr_min);
-		nr_max = max(nr, nr_max);
-		sum += nr;
-	}
-	BUG_ON(nr_min > nr_max);
-
-	BUG_ON(sum > g->p.nr_tasks);
-
-	if (0 && (sum < g->p.nr_tasks))
-		return;
-
-	/*
-	 * Count the number of distinct process groups present
-	 * on nodes - when we are converged this will decrease
-	 * to g->p.nr_proc:
-	 */
-	process_groups = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		int processes = count_node_processes(node);
-
-		nr = nodes[node];
-		tprintf(" %2d/%-2d", nr, processes);
-
-		process_groups += processes;
-	}
-
-	distance = nr_max - nr_min;
-
-	tprintf(" [%2d/%-2d]", distance, process_groups);
-
-	tprintf(" l:%3d-%-3d (%3d)",
-		loops_done_min, loops_done_max, loops_done_max-loops_done_min);
-
-	if (loops_done_min && loops_done_max) {
-		double skew = 1.0 - (double)loops_done_min/loops_done_max;
-
-		tprintf(" [%4.1f%%]", skew * 100.0);
-	}
-
-	calc_convergence_compression(&strong);
-
-	if (strong && process_groups == g->p.nr_proc) {
-		if (!*convergence) {
-			*convergence = runtime_ns_max;
-			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
-			if (g->p.measure_convergence) {
-				g->all_converged = true;
-				g->stop_work = true;
-			}
-		}
-	} else {
-		if (*convergence) {
-			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
-			*convergence = 0;
-		}
-		tprintf("\n");
-	}
-}
-
-static void show_summary(double runtime_ns_max, int l, double *convergence)
+static void show_summary(double runtime_ns_max, int l)
 {
 	tprintf("\r #  %5.1f%%  [%.1f mins]",
 		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
 
-	calc_convergence(runtime_ns_max, convergence);
-
 	if (g->p.show_details >= 0)
 		fflush(stdout);
 }
@@ -925,11 +667,9 @@ static void *worker_thread(void *__tdata)
 	struct timeval start0, start, stop, diff;
 	int process_nr = td->process_nr;
 	int thread_nr = td->thread_nr;
-	unsigned long last_perturbance;
 	int task_nr = td->task_nr;
 	int details = g->p.show_details;
-	int first_task, last_task;
-	double convergence = 0;
+	int last_task;
 	u64 val = td->val;
 	double runtime_ns_max;
 	u8 *global_data;
@@ -955,10 +695,6 @@ static void *worker_thread(void *__tdata)
 	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
 		last_task = 1;
 
-	first_task = 0;
-	if (process_nr == 0 && thread_nr == 0)
-		first_task = 1;
-
 	if (details >= 2) {
 		printf("#  thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
 			process_nr, thread_nr, global_data, process_data, thread_data);
@@ -983,7 +719,6 @@ static void *worker_thread(void *__tdata)
 	gettimeofday(&start0, NULL);
 
 	start = stop = start0;
-	last_perturbance = start.tv_sec;
 
 	for (l = 0; l < g->p.nr_loops; l++) {
 		start = stop;
@@ -1015,7 +750,7 @@ static void *worker_thread(void *__tdata)
 		update_curr_cpu(task_nr, work_done);
 		bytes_done += work_done;
 
-		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+		if (details < 0 && !g->p.nr_secs)
 			continue;
 
 		td->loops_done = l;
@@ -1035,37 +770,6 @@ static void *worker_thread(void *__tdata)
 		if (start.tv_sec == stop.tv_sec)
 			continue;
 
-		/*
-		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
-		 * by migrating to CPU#0:
-		 */
-		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
-			int target_cpu;
-			int this_cpu;
-
-			last_perturbance = stop.tv_sec;
-
-			/*
-			 * Depending on where we are running, move into
-			 * the other half of the system, to create some
-			 * real disturbance:
-			 */
-			this_cpu = g->threads[task_nr].curr_cpu;
-			if (this_cpu < g->p.nr_cpus/2)
-				target_cpu = g->p.nr_cpus-1;
-			else
-				target_cpu = 0;
-
-			orig_mask = bind_to_cpu(target_cpu);
-
-			/* Here we are running on the target CPU already */
-			if (details >= 1)
-				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
-
-			bind_to_cpumask(orig_mask);
-		}
-
 		if (details >= 3) {
 			timersub(&stop, &start, &diff);
 			runtime_ns_max = diff.tv_sec * 1000000000;
@@ -1084,7 +788,7 @@ static void *worker_thread(void *__tdata)
 		runtime_ns_max = diff.tv_sec * 1000000000ULL;
 		runtime_ns_max += diff.tv_usec * 1000ULL;
 
-		show_summary(runtime_ns_max, l, &convergence);
+		show_summary(runtime_ns_max, l);
 	}
 
 	gettimeofday(&stop, NULL);
@@ -1226,8 +930,7 @@ static int init(void)
 
 	g->p.nr_nodes = numa_max_node() + 1;
 
-	/* char array in count_process_nodes(): */
-	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+	BUG_ON(g->p.nr_nodes < 0);
 
 	if (g->p.show_quiet && !g->p.show_details)
 		g->p.show_details = -1;
@@ -1427,11 +1130,6 @@ static int __bench_syscall(const char *name)
 	bytes = g->bytes_done;
 	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
 
-	if (g->p.measure_convergence) {
-		print_res(name, runtime_sec_max,
-			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
-	}
-
 	print_res(name, runtime_sec_max,
 		"secs,", "runtime-max/thread",	"secs slowest (max) thread-runtime");
 
@@ -1517,10 +1215,6 @@ static void init_params(struct params *p, const char *name, int argc, const char
 	/* Initialize nonzero defaults: */
 
 	p->serialize_startup		= 1;
-	p->data_reads			= true;
-	p->data_writes			= true;
-	p->data_backwards		= true;
-	p->data_rand_walk		= true;
 	p->nr_loops			= 10000000;
 	p->init_random			= true;
 	p->mb_global_str		= "1";

next prev parent reply	other threads:[~2016-02-01  7:49 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-01  2:12 [PATCH 0/4 v3] sched,time: reduce nohz_full syscall overhead 40% riel
2016-02-01  2:12 ` [PATCH 1/4] sched,time: remove non-power-of-two divides from __acct_update_integrals riel
2016-02-01  4:46   ` kbuild test robot
2016-02-01  8:37   ` Thomas Gleixner
2016-02-01  9:22     ` Peter Zijlstra
2016-02-01  9:31       ` Thomas Gleixner
2016-02-01 13:44       ` Rik van Riel
2016-02-01 13:51         ` Peter Zijlstra
2016-02-01  2:12 ` [PATCH 2/4] acct,time: change indentation in __acct_update_integrals riel
2016-02-01  2:12 ` [PATCH 3/4] time,acct: drop irq save & restore from __acct_update_integrals riel
2016-02-01  9:28   ` Peter Zijlstra
2016-02-01 19:22     ` Rik van Riel
2016-02-01  2:12 ` [PATCH 4/4] sched,time: only call account_{user,sys,guest,idle}_time once a jiffy riel
2016-02-01  9:29   ` Peter Zijlstra
2016-02-01 19:23     ` Rik van Riel
2016-02-01  7:41 ` [PATCH] perf tooling: Add 'perf bench syscall' benchmark Ingo Molnar
2016-02-01  7:48   ` Ingo Molnar [this message]
2016-02-01 15:41   ` Andy Lutomirski
2016-02-03 10:22     ` Ingo Molnar
2016-06-20 18:00       ` [PATCH] perf: add 'perf bench syscall' Josh Poimboeuf
2016-06-20 19:16         ` Andy Lutomirski
2016-06-21 14:55           ` Josh Poimboeuf
2016-06-21 16:31             ` Andy Lutomirski

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:5a4ef02176d dfblob:fabac462bde )
 OR (
bs:"perf tooling: Simplify 'perf bench syscall'" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160201074849.GA9129@gmail.com \
    --to=mingo@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=acme@infradead.org \
    --cc=clark@redhat.com \
    --cc=fweisbec@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=peterz@infradead.org \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.