All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ankur Arora <ankur.a.arora@oracle.com>
To: Namhyung Kim <namhyung@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>,
	Ian Rogers <irogers@google.com>,
	James Clark <james.clark@linaro.org>,
	Jiri Olsa <jolsa@kernel.org>,
	Adrian Hunter <adrian.hunter@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	LKML <linux-kernel@vger.kernel.org>,
	linux-perf-users@vger.kernel.org,
	Ankur Arora <ankur.a.arora@oracle.com>
Subject: Re: [PATCH v2 2/2] perf bench: Add -t/--threads option to perf bench mem mmap
Date: Wed, 25 Feb 2026 23:02:59 -0800	[thread overview]
Message-ID: <878qcgx9zw.fsf@oracle.com> (raw)
In-Reply-To: <20260219004417.188434-2-namhyung@kernel.org>


Namhyung Kim <namhyung@kernel.org> writes:

> So that it can measure overhead of mmap_lock and/or per-VMA lock
> contention.
>
>   $ perf bench mem mmap -f demand -l 1000 -t 1
>   # Running 'mem/mmap' benchmark:
>   # function 'demand' (Demand loaded mmap())
>   # Copying 1MB bytes ...
>
>          2.786858 GB/sec
>
>   $ perf bench mem mmap -f demand -l 1000 -t 2
>   # Running 'mem/mmap' benchmark:
>   # function 'demand' (Demand loaded mmap())
>   # Copying 1MB bytes ...
>
>          1.624468 GB/sec/thread   ( +-   0.30% )
>
>   $ perf bench mem mmap -f demand -l 1000 -t 3
>   # Running 'mem/mmap' benchmark:
>   # function 'demand' (Demand loaded mmap())
>   # Copying 1MB bytes ...
>
>          1.493068 GB/sec/thread   ( +-   0.15% )
>
>   $ perf bench mem mmap -f demand -l 1000 -t 4
>   # Running 'mem/mmap' benchmark:
>   # function 'demand' (Demand loaded mmap())
>   # Copying 1MB bytes ...
>
>          1.006087 GB/sec/thread   ( +-   0.41% )
>
> Cc: Ankur Arora <ankur.a.arora@oracle.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Ankur Arora <ankur.a.arora@oracle.com>

> ---
> v2)
>  * correct timeval2double calculation
>  * add "/thread" notation when -t > 1  (James)
>  * add stddev when -t > 1  (Ankur)
>
>  tools/perf/Documentation/perf-bench.txt |   4 +
>  tools/perf/bench/mem-functions.c        | 109 +++++++++++++++++++-----
>  2 files changed, 92 insertions(+), 21 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
> index 1160224cb718392d..c5913cf59c988421 100644
> --- a/tools/perf/Documentation/perf-bench.txt
> +++ b/tools/perf/Documentation/perf-bench.txt
> @@ -274,6 +274,10 @@ Repeat mmap() invocation this number of times.
>  --cycles::
>  Use perf's cpu-cycles event instead of gettimeofday syscall.
>
> +-t::
> +--threads=<NUM>::
> +Create multiple threads to call mmap/munmap concurrently.
> +
>  SUITES FOR 'numa'
>  ~~~~~~~~~~~~~~~~~
>  *mem*::
> diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
> index 676c8d18f4c2e259..8e04ae69f8a85c5e 100644
> --- a/tools/perf/bench/mem-functions.c
> +++ b/tools/perf/bench/mem-functions.c
> @@ -7,13 +7,14 @@
>   * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
>   */
>
> -#include "debug.h"
> +#include "bench.h"
>  #include "../perf-sys.h"
>  #include <subcmd/parse-options.h>
> -#include "../util/header.h"
> -#include "../util/cloexec.h"
> -#include "../util/string2.h"
> -#include "bench.h"
> +#include "util/cloexec.h"
> +#include "util/debug.h"
> +#include "util/header.h"
> +#include "util/stat.h"
> +#include "util/string2.h"
>  #include "mem-memcpy-arch.h"
>  #include "mem-memset-arch.h"
>
> @@ -26,6 +27,7 @@
>  #include <errno.h>
>  #include <linux/time64.h>
>  #include <linux/log2.h>
> +#include <pthread.h>
>
>  #define K 1024
>
> @@ -41,6 +43,7 @@ static unsigned int	nr_loops	= 1;
>  static bool		use_cycles;
>  static int		cycles_fd;
>  static unsigned int	seed;
> +static unsigned int	nr_threads	= 1;
>
>  static const struct option bench_common_options[] = {
>  	OPT_STRING('s', "size", &size_str, "1MB",
> @@ -121,6 +124,8 @@ static struct perf_event_attr cycle_attr = {
>  	.config		= PERF_COUNT_HW_CPU_CYCLES
>  };
>
> +static struct stats stats;
> +
>  static int init_cycles(void)
>  {
>  	cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
> @@ -174,18 +179,18 @@ static void clock_accum(union bench_clock *a, union bench_clock *b)
>
>  static double timeval2double(struct timeval *ts)
>  {
> -	return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC;
> +	return ((double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC) / nr_threads;
>  }
>
>  #define print_bps(x) do {						\
>  		if (x < K)						\
> -			printf(" %14lf bytes/sec\n", x);		\
> +			printf(" %14lf bytes/sec", x);			\
>  		else if (x < K * K)					\
> -			printf(" %14lfd KB/sec\n", x / K);		\
> +			printf(" %14lfd KB/sec", x / K);		\
>  		else if (x < K * K * K)					\
> -			printf(" %14lf MB/sec\n", x / K / K);		\
> +			printf(" %14lf MB/sec", x / K / K);		\
>  		else							\
> -			printf(" %14lf GB/sec\n", x / K / K / K);	\
> +			printf(" %14lf GB/sec", x / K / K / K);	\
>  	} while (0)
>
>  static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p,
> @@ -197,6 +202,7 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param
>  	void *src = NULL, *dst = NULL;
>
>  	memset(&rt, 0, sizeof(rt));
> +	init_stats(&stats);
>  	printf("# function '%s' (%s)\n", r->name, r->desc);
>
>  	if (r->fn.init && r->fn.init(info, p, &src, &dst))
> @@ -211,11 +217,16 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param
>  	switch (bench_format) {
>  	case BENCH_FORMAT_DEFAULT:
>  		if (use_cycles) {
> -			printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total);
> +			printf(" %14lf cycles/byte", (double)rt.cycles/(double)p->size_total);
>  		} else {
>  			result_bps = (double)p->size_total/timeval2double(&rt.tv);
>  			print_bps(result_bps);
>  		}
> +		if (nr_threads > 1) {
> +			printf("/thread\t( +- %6.2f%% )",
> +			       rel_stddev_stats(stddev_stats(&stats), avg_stats(&stats)));
> +		}
> +		printf("\n");
>  		break;
>
>  	case BENCH_FORMAT_SIMPLE:
> @@ -495,16 +506,27 @@ static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, boo
>  	}
>  }
>
> -static int do_mmap(const struct function *r, struct bench_params *p,
> -		  void *src __maybe_unused, void *dst __maybe_unused,
> -		  union bench_clock *accum)
> +struct mmap_data {
> +	pthread_t id;
> +	const struct function *func;
> +	struct bench_params *params;
> +	union bench_clock result;
> +	unsigned int seed;
> +	int error;
> +};
> +
> +static void *do_mmap_thread(void *arg)
>  {
> +	struct mmap_data *data = arg;
> +	const struct function *r = data->func;
> +	struct bench_params *p = data->params;
>  	union bench_clock start, end, diff;
>  	mmap_op_t fn = r->fn.mmap_op;
>  	bool populate = strcmp(r->name, "populate") == 0;
> +	void *dst;
>
> -	if (p->seed)
> -		srand(p->seed);
> +	if (data->seed)
> +		srand(data->seed);
>
>  	for (unsigned int i = 0; i < p->nr_loops; i++) {
>  		clock_get(&start);
> @@ -515,16 +537,59 @@ static int do_mmap(const struct function *r, struct bench_params *p,
>  		fn(dst, p->size, p->page_shift, p->seed);
>  		clock_get(&end);
>  		diff = clock_diff(&start, &end);
> -		clock_accum(accum, &diff);
> +		clock_accum(&data->result, &diff);
>
>  		bench_munmap(dst, p->size);
>  	}
>
> -	return 0;
> +	return data;
>  out:
> -	printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
> -			p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
> -	return -1;
> +	data->error = -ENOMEM;
> +	return NULL;
> +}
> +
> +static int do_mmap(const struct function *r, struct bench_params *p,
> +		  void *src __maybe_unused, void *dst __maybe_unused,
> +		  union bench_clock *accum)
> +{
> +	struct mmap_data *data;
> +	int error = 0;
> +
> +	data = calloc(nr_threads, sizeof(*data));
> +	if (!data) {
> +		printf("# Failed to allocate thread resources\n");
> +		return -1;
> +	}
> +
> +	for (unsigned int i = 0; i < nr_threads; i++) {
> +		data[i].func = r;
> +		data[i].params = p;
> +		if (p->seed)
> +			data[i].seed = p->seed + i;
> +
> +		if (pthread_create(&data[i].id, NULL, do_mmap_thread, &data[i]) < 0)
> +			data[i].error = -errno;
> +	}
> +
> +	for (unsigned int i = 0; i < nr_threads; i++) {
> +		union bench_clock *t = &data[i].result;
> +
> +		pthread_join(data[i].id, NULL);
> +
> +		clock_accum(accum, t);
> +		if (use_cycles)
> +			update_stats(&stats, t->cycles);
> +		else
> +			update_stats(&stats, t->tv.tv_sec * 1e6 + t->tv.tv_usec);
> +		error |= data[i].error;
> +	}
> +	free(data);
> +
> +	if (error) {
> +		printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
> +		       p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
> +	}
> +	return error ? -1 : 0;
>  }
>
>  static const char * const bench_mem_mmap_usage[] = {
> @@ -549,6 +614,8 @@ int bench_mem_mmap(int argc, const char **argv)
>  	static const struct option bench_mmap_options[] = {
>  		OPT_UINTEGER('r', "randomize", &seed,
>  			    "Seed to randomize page access offset."),
> +		OPT_UINTEGER('t', "threads", &nr_threads,
> +			    "Number of threads to run concurrently (default: 1)."),
>  		OPT_PARENT(bench_common_options),
>  		OPT_END()
>  	};


--
ankur

  parent reply	other threads:[~2026-02-26  7:03 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-19  0:44 [PATCH v2 1/2] perf bench: Fix initialization of union Namhyung Kim
2026-02-19  0:44 ` [PATCH v2 2/2] perf bench: Add -t/--threads option to perf bench mem mmap Namhyung Kim
2026-02-19 10:43   ` James Clark
2026-02-26  7:02   ` Ankur Arora [this message]
2026-02-27 21:42   ` Namhyung Kim
2026-02-19 10:36 ` [PATCH v2 1/2] perf bench: Fix initialization of union James Clark
2026-02-19 10:38   ` James Clark
2026-02-19 15:02     ` Leo Yan
2026-02-19 19:00       ` Namhyung Kim
2026-02-23 19:12         ` Namhyung Kim
2026-02-24 12:20           ` Leo Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=878qcgx9zw.fsf@oracle.com \
    --to=ankur.a.arora@oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=irogers@google.com \
    --cc=james.clark@linaro.org \
    --cc=jolsa@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.