[PATCH] perf bench sched cpu-matrix benchmark

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Paul Turner <pjt@google.com>, Venki Pallipadi <venki@google.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	kamalesh@linux.vnet.ibm.com
Subject: [PATCH] perf bench sched cpu-matrix benchmark
Date: Fri, 4 Nov 2011 17:26:20 +0530	[thread overview]
Message-ID: <20111104115620.GA25704@linux.vnet.ibm.com> (raw)

Hi,

During the discussion of the nohz load balancer fix 
(https://lkml.org/lkml/2011/9/27/197), Ingo Molnar suggested
to replace the simple while1 loop, used for publishing the 
test results with cpu-matrix test. This patch adds cpu-matrix 
benchmark under perf bench.

TODO:
	- Provide option to run main thread at real-time priority.
	- Provide option for changing scheduling priority/policy
	  of worker threads.


perf bench: Add sched cpu-matrix benchmark

perf bench sched cpu-matrix benchmark is a matrix multiplication
workload, which can be replaced with the traditional while1 
cpu hog.

Example of usage:

% perf bench sched cpu-matrix 
# Running sched/cpu-matrix benchmark...

Multiplication of [20] x [20] matrix, using [1] threads
 Total time: 0.000170 [sec]

% perf bench sched cpu-matrix -s1k -t3 
# Running sched/cpu-matrix benchmark...
elapsed time = 0.100066 sec, progress = 27270868
elapsed time = 0.100079 sec, progress = 27299841
elapsed time = 0.100071 sec, progress = 27253513
elapsed time = 0.100073 sec, progress = 27272693
elapsed time = 0.100090 sec, progress = 10208415

Multiplication of [1024] x [1024] matrix, using [3] threads
 Total time: 0.500495 [sec]

% perf bench --format=simple sched cpu-matrix -s1k -t3 
 Total time: 0.454764 [sec]

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> 
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>

--
 tools/perf/Makefile           |    1 +
 tools/perf/bench/bench.h      |    2 +
 tools/perf/bench/cpu-matrix.c |  410 +++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin-bench.c    |    3 +
 4 files changed, 416 insertions(+), 0 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b98e307..02bd562 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -363,6 +363,7 @@ ifeq ($(RAW_ARCH),x86_64)
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
 endif
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+BUILTIN_OBJS += $(OUTPUT)bench/cpu-matrix.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
 BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index f7781c6..174465a 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -4,6 +4,8 @@
 extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
 extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
 extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used);
+extern int bench_cpu_matrix(int argc, const char **argv,
+				const char *prefix __used);
 
 #define BENCH_FORMAT_DEFAULT_STR	"default"
 #define BENCH_FORMAT_DEFAULT		0
diff --git a/tools/perf/bench/cpu-matrix.c b/tools/perf/bench/cpu-matrix.c
new file mode 100644
index 0000000..95bbdfa
--- /dev/null
+++ b/tools/perf/bench/cpu-matrix.c
@@ -0,0 +1,410 @@
+/*
+ * cpu matrix multiplication benchmark
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Authors: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *	    Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../builtin.h"
+#include "bench.h"
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <math.h>
+
+#define DEFAULT_ITERATIONS 1
+#define DEFAULT_MATRIX_SIZE 20
+#define DEFAULT_NUM_THREADS 1
+#define DEFAULT_SLEEP_MSEC 0
+
+static int iterations = DEFAULT_ITERATIONS;
+static const char *mat_size_str = "20";
+static unsigned int mat_size = DEFAULT_MATRIX_SIZE;
+static int num_threads = DEFAULT_NUM_THREADS;
+
+static int ready_count;
+static int sleep_msec = DEFAULT_SLEEP_MSEC;
+static pthread_mutex_t ready_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static inline void barf(const char *str)
+{
+	if (errno)
+		perror(str);
+	else
+		printf("%s\n", str);
+
+	exit(1);
+}
+
+static inline void *galloc(size_t size)
+{
+	void *ptr = malloc(size);
+
+	if (!ptr)
+		barf("malloc ");
+
+	return ptr;
+}
+
+
+static inline void populate_matrix(unsigned int *matrix, int size)
+{
+	unsigned int i;
+
+	for (i = 0; i < ((u64)size * (u64)size); i++)
+		*(matrix + i) = random() % 100;
+}
+
+static const struct option options[] = {
+	OPT_STRING('s', "matrix size", &mat_size_str, "20",
+			"Specify the size of the square matrix. "
+			"Available unit: K, M (upper and lower)"),
+	OPT_INTEGER('i', "iterations", &iterations,
+			"Specify number of iterations"),
+	OPT_INTEGER('t', "num_threads", &num_threads,
+			"Specify number of threads"),
+	OPT_INTEGER('p', "sleep_msec", &sleep_msec,
+			"Progress to be printed every P millseconds."),
+	OPT_END()
+};
+
+static const char *const bench_cpu_matrix_usage[] = {
+	"perf bench sched cpu_matrix <options>",
+	NULL
+};
+
+struct thread_work {
+	unsigned int *a, *b, *c;	/* Matrix A, B, C */
+	unsigned int progress;		/* Multiplication count */
+	unsigned int prev_progress;	/* Used for calculating delta */
+	int size;			/* No. of rows handled by a thread */
+	int start_row;			/* Row to start multiplication */
+	int done;			/* Indication of the thread job done */
+	int num_threads;		/* Number of threads */
+	int iter_count;			/* Number of iterations */
+};
+
+/*
+ * Returns the delta/difference between previous progress
+ * and current progress, where progress is summation of
+ * multiplications done by each thread.
+ */
+static u64 thread_progress(struct thread_work *work, int thread_count)
+{
+	int i;
+	u64 total_progress = 0;
+
+	for (i = 0; i < thread_count; i++) {
+		unsigned int progress, prev_progress, delta;
+
+		progress = work[i].progress;
+		prev_progress = work[i].prev_progress;
+		delta = progress - prev_progress;
+
+		work[i].prev_progress = progress;
+
+		total_progress += delta;
+	}
+
+	return total_progress;
+}
+
+/*
+ * Returns 1 if all the threads are done with multiplication
+ * else return 0.
+ */
+static inline int all_threads_done(struct thread_work *work, int thread_count)
+{
+	int i, done = 0;
+
+	for (i = 0; i < thread_count; ++i) {
+		if (work[i].done)
+			done++;
+	}
+
+	if (done >= thread_count)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Prints the progress of multipilcations done by all threads
+ * every sleep_msec
+ */
+static void print_progress(struct thread_work *work, int thread_count)
+{
+	struct timeval start, end, delta;
+	u64 curr_progress = 0;
+
+	if (!sleep_msec)
+		return;
+
+	delta.tv_sec = 0;
+	delta.tv_usec = 0;
+
+	gettimeofday(&start, NULL);
+
+	while (!all_threads_done(work, thread_count)) {
+		usleep(sleep_msec);
+		gettimeofday(&end, NULL);
+		curr_progress = thread_progress(work, thread_count);
+		timersub(&end, &start, &delta);
+		start = end;
+
+		printf("elapsed time = %lu.%06lu sec, progress = %lu\n",
+				delta.tv_sec, delta.tv_usec, curr_progress);
+		fflush(stdout);
+	}
+
+}
+
+/*
+ * Multiples single row X no. of columns.
+ */
+static void row_col_multiply(unsigned int *a, unsigned int *b, unsigned int *c,
+				int row_num, int col_num, int size,
+				struct thread_work *work)
+{
+	int i, j, k, sum = 0;
+
+	for (i = 0; i < size; ++i) {
+		j = *(a + (row_num * size) + i);
+		k = *(b + (i * size) + col_num);
+		sum += (j * k);
+		work->progress++;
+	}
+
+	*(c + (row_num * size) + col_num) = sum;
+}
+
+static void *thread_fn(void *arg)
+{
+	struct thread_work *work = arg;
+	int row, col, i, j, k;
+	unsigned int *a = work->a, *b = work->b, *c = work->c;
+
+	pthread_mutex_lock(&ready_lock);
+	ready_count++;
+	pthread_mutex_unlock(&ready_lock);
+
+	/*
+	 * Wait for all the threads to start up
+	 */
+	while (ready_count < work->num_threads)
+		cpu_relax();
+
+	/*
+	 * Iteration loop
+	 */
+	for (k = 0; k < work->iter_count; k++) {
+		row = work->start_row;
+
+		/*
+		 * Rows the thread is suppose to work on
+		 */
+		for (i = 0; i < work->size; i++, row++) {
+			/*
+			 * Reset the column to first column
+			 */
+			col = 0;
+
+			for (j = 0; j < work->size; j++, col++)
+				row_col_multiply(a, b, c, row, col,
+						 work->size, work);
+		}
+	}
+
+	work->done = 1;
+
+	return NULL;
+}
+
+/*
+ * Core function to create threads and assign work to them.
+ */
+static void matrix_multiply(unsigned int *a, unsigned int *b, unsigned int *c,
+				int matrix_size, int iter_count,
+				int thread_count)
+{
+	int i;
+	unsigned int per_thread_work, rem, row_idx = 0;
+	struct thread_work *work_arr;
+	pthread_t *thread_ids;
+
+	assert(thread_count > 0);
+
+	work_arr = galloc(thread_count * sizeof(struct thread_work));
+	thread_ids = galloc(thread_count * sizeof(pthread_t));
+
+	per_thread_work = matrix_size / thread_count;
+	rem = matrix_size;
+
+	for (i = 0; i < thread_count; ++i) {
+		int num_rows, rc;
+
+		/*
+		 * If the thread is the last thread, assign it all the
+		 * remaining rows
+		 */
+		if (i == (thread_count - 1))
+			num_rows = rem;
+		else
+			num_rows = (rem > per_thread_work) ?
+						 per_thread_work : rem;
+
+		rem -= num_rows;
+		work_arr[i].a = a;
+		work_arr[i].b = b;
+		work_arr[i].c = c;
+		work_arr[i].iter_count = iter_count;
+		work_arr[i].size = num_rows;
+		work_arr[i].start_row = row_idx;
+		work_arr[i].progress = 0;
+		work_arr[i].prev_progress = 0;
+		work_arr[i].done = 0;
+		work_arr[i].num_threads = thread_count;
+
+		row_idx += num_rows;
+
+		rc = pthread_create(&thread_ids[i], NULL, thread_fn,
+							 &work_arr[i]);
+		if (rc != 0)
+			barf("pthread_create ");
+	}
+
+	assert(!rem);
+	assert(row_idx == mat_size);
+
+	print_progress(work_arr, thread_count);
+
+	for (i = 0; i < thread_count; ++i)
+		pthread_join(thread_ids[i], NULL);
+
+	free(work_arr);
+	free(thread_ids);
+}
+
+#define K 1024LL
+
+static int parse_mat_size(const char *str)
+{
+	unsigned int i;
+	int unit = 1;
+	s64 length = -1;
+
+	if (!isdigit(str[0]))
+		return -1;
+
+	for (i = 1; i < strlen(str); i++) {
+		switch (str[i]) {
+		case 'k':
+		case 'K':
+			unit = (unit == 1 ? K : -1);
+			break;
+		case 'm':
+		case 'M':
+			unit = (unit == 1 ? (K * K) : -1);
+			break;
+		default:
+			if (!isdigit(str[i]))
+				goto out_err;
+			break;
+		}
+	}
+
+	if (unit > 0)
+		length = atoll(str) * unit;
+
+	if (length > INT_MAX)
+		length = -1;
+
+out_err:
+	return (int)length;
+}
+
+
+int bench_cpu_matrix(int argc, const char **argv,
+			const char *prefix __used)
+{
+
+	unsigned int *mat_a, *mat_b, *mat_c;
+	struct timeval start, stop, diff;
+	size_t alloc_size;
+
+	errno = 0;
+
+	parse_options(argc, argv, options, bench_cpu_matrix_usage, 0);
+	mat_size = parse_mat_size(mat_size_str);
+	if ((int)mat_size <= 0)
+		barf("Invalid size of matrix ");
+
+	if (iterations <= 0)
+		barf("Invalid loop(s) of iterations ");
+
+	if (num_threads <= 0)
+		barf("Invalid number of threads ");
+
+	alloc_size = (u64)mat_size * (u64)mat_size * sizeof(unsigned int);
+
+	mat_a = galloc(alloc_size);
+	mat_b = galloc(alloc_size);
+	mat_c = galloc(alloc_size);
+
+	sleep_msec *= 1000;
+
+	populate_matrix(mat_a, mat_size);
+	populate_matrix(mat_b, mat_size);
+
+	gettimeofday(&start, NULL);
+
+	matrix_multiply(mat_a, mat_b, mat_c, mat_size, iterations, num_threads);
+
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+
+	free(mat_a);
+	free(mat_b);
+	free(mat_c);
+
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("\nMultiplication of [%d] x [%d] matrix,"
+			" using [%d] threads\n",
+				 mat_size, mat_size, num_threads);
+		printf(" %s: %lu.%06lu [sec]\n", "Total time",
+				diff.tv_sec, diff.tv_usec);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf(" %s: %lu.%06lu [sec]\n", "Total time",
+				diff.tv_sec, diff.tv_usec);
+		break;
+	default:
+		barf("Unknown benchmark format");
+		break;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index fcb9626..df84428 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -42,6 +42,9 @@ static struct bench_suite sched_suites[] = {
 	{ "pipe",
 	  "Flood of communication over pipe() between two processes",
 	  bench_sched_pipe      },
+	{ "cpu-matrix",
+	  "Benchmark to run cpu matrix multiplication",
+	  bench_cpu_matrix      },
 	suite_all,
 	{ NULL,
 	  NULL,

next             reply	other threads:[~2011-11-04 11:56 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-11-04 11:56 Kamalesh Babulal [this message]
2011-11-04 12:50 ` [PATCH] perf bench sched cpu-matrix benchmark Srivatsa Vaddagiri
2011-11-10 14:52 ` Ingo Molnar
2011-11-14  8:57   ` [PATCH V2] " Kamalesh Babulal

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:b98e307 dfblob:02bd562 dfblob:f7781c6 dfblob:174465a
dfblob:95bbdfa dfblob:fcb9626 dfblob:df84428 )
 OR (
bs:"[PATCH] perf bench sched cpu-matrix benchmark" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20111104115620.GA25704@linux.vnet.ibm.com \
    --to=kamalesh@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=pjt@google.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@linux.vnet.ibm.com \
    --cc=venki@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.