Re: [PATCH v1 7/7] perf bench sync: Add latency histogram functionality

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Namhyung Kim <namhyung@kernel.org>
To: Yuzhuo Jing <yuzhuo@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Jiri Olsa <jolsa@kernel.org>, Ian Rogers <irogers@google.com>,
	Adrian Hunter <adrian.hunter@intel.com>,
	Liang Kan <kan.liang@linux.intel.com>,
	Yuzhuo Jing <yzj@umich.edu>,
	Andrea Parri <parri.andrea@gmail.com>,
	Palmer Dabbelt <palmer@rivosinc.com>,
	Charlie Jenkins <charlie@rivosinc.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Kumar Kartikeya Dwivedi <memxor@gmail.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Barret Rhoden <brho@google.com>,
	Alexandre Ghiti <alexghiti@rivosinc.com>,
	Guo Ren <guoren@kernel.org>,
	linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org
Subject: Re: [PATCH v1 7/7] perf bench sync: Add latency histogram functionality
Date: Wed, 30 Jul 2025 22:24:53 -0700	[thread overview]
Message-ID: <aIr-JTyuJBKPJN8c@google.com> (raw)
In-Reply-To: <20250729022640.3134066-8-yuzhuo@google.com>

On Mon, Jul 28, 2025 at 07:26:40PM -0700, Yuzhuo Jing wrote:
> Add an option to print the histogram of lock acquire latencies (unit in
> TSCs).
> 
> Signed-off-by: Yuzhuo Jing <yuzhuo@google.com>
> ---
>  tools/perf/bench/sync.c | 97 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 96 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/bench/sync.c b/tools/perf/bench/sync.c
> index 2685cb66584c..c85e9853c72a 100644
> --- a/tools/perf/bench/sync.c
> +++ b/tools/perf/bench/sync.c
> @@ -15,14 +15,19 @@
>  #include <sys/cdefs.h>
>  
>  #include "bench.h"
> +#include "../util/tsc.h"
>  
>  #include "include/qspinlock.h"
>  
>  #define NS 1000000000ull
>  #define CACHELINE_SIZE 64
>  
> +#define DEFAULT_HIST_INTERVAL 1000
> +
>  static unsigned int nthreads;
>  static unsigned long nspins = 10000ul;
> +static bool do_hist;
> +static u64 hist_interval = DEFAULT_HIST_INTERVAL;
>  
>  struct barrier_t;
>  
> @@ -45,6 +50,7 @@ struct worker {
>  	struct lock_ops *ops;
>  	struct barrier_t *barrier;
>  	u64 runtime;		// in nanoseconds
> +	u64 *lock_latency;	// in TSCs

Why TSC?  Is it for x86 only?

Thanks,
Namhyung

>  };
>  
>  static const struct option options[] = {
> @@ -52,6 +58,10 @@ static const struct option options[] = {
>  		"Specify number of threads (default: number of CPUs)."),
>  	OPT_ULONG('n',		"spins",	&nspins,
>  		"Number of lock acquire operations per thread (default: 10,000 times)."),
> +	OPT_BOOLEAN(0,		"hist",		&do_hist,
> +		"Print a histogram of lock acquire TSCs."),
> +	OPT_U64(0,	"hist-interval",	&hist_interval,
> +		"Histogram bucket size (default 1,000 TSCs)."),
>  	OPT_END()
>  };
>  
> @@ -109,6 +119,25 @@ static void lock_loop(const struct lock_ops *ops, unsigned long n)
>  	}
>  }
>  
> +/*
> + * A busy loop to acquire and release the given lock N times, and also collect
> + * all acquire latencies, for histogram use.  Note that the TSC operations
> + * latency itself is also included.
> + */
> +static void lock_loop_timing(const struct lock_ops *ops, unsigned long n, u64 *sample_buffer)
> +{
> +	unsigned long i;
> +	u64 t1, t2;
> +
> +	for (i = 0; i < n; ++i) {
> +		t1 = rdtsc();
> +		ops->lock(ops->data);
> +		t2 = rdtsc();
> +		ops->unlock(ops->data);
> +		sample_buffer[i] = t2 - t1;
> +	}
> +}
> +
>  /*
>   * Thread worker function.  Runs lock loop for N/5 times before and after
>   * the main timed loop.
> @@ -127,7 +156,10 @@ static void *sync_workerfn(void *args)
>  	lock_loop(worker->ops, nspins / 5);
>  
>  	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &starttime);
> -	lock_loop(worker->ops, nspins);
> +	if (worker->lock_latency)
> +		lock_loop_timing(worker->ops, nspins, worker->lock_latency);
> +	else
> +		lock_loop(worker->ops, nspins);
>  	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &endtime);
>  
>  	/* Tail loop (not counted) to keep the above loop contended. */
> @@ -139,6 +171,57 @@ static void *sync_workerfn(void *args)
>  	return NULL;
>  }
>  
> +/*
> + * Calculate and print a histogram.
> + */
> +static void print_histogram(struct worker *workers)
> +{
> +	u64 tsc_max = 0;
> +	u64 *buckets;
> +	unsigned long nbuckets;
> +
> +	if (hist_interval == 0)
> +		hist_interval = DEFAULT_HIST_INTERVAL;
> +
> +	printf("Lock acquire histogram:\n");
> +
> +	/* Calculate the max TSC value to get the number of buckets needed. */
> +	for (unsigned int i = 0; i < nthreads; ++i) {
> +		struct worker *w = workers + i;
> +
> +		for (unsigned long j = 0; j < nspins; ++j)
> +			tsc_max = max(w->lock_latency[j], tsc_max);
> +	}
> +	nbuckets = (tsc_max + hist_interval - 1) / hist_interval;
> +
> +	/* Allocate the actual bucket.  The bucket definition may be optimized
> +	 * if it is sparse.
> +	 */
> +	buckets = calloc(nbuckets, sizeof(*buckets));
> +	if (!buckets)
> +		err(EXIT_FAILURE, "calloc");
> +
> +	/* Iterate through all latencies again to fill the buckets. */
> +	for (unsigned int i = 0; i < nthreads; ++i) {
> +		struct worker *w = workers + i;
> +
> +		for (unsigned long j = 0; j < nspins; ++j) {
> +			u64 latency = w->lock_latency[j];
> +			++buckets[latency / hist_interval];
> +		}
> +	}
> +
> +	/* Print the histogram as a table. */
> +	printf("Bucket, Count\n");
> +	for (unsigned long i = 0; i < nbuckets; ++i) {
> +		if (buckets[i] == 0)
> +			continue;
> +		printf("%"PRIu64", %"PRIu64"\n", hist_interval * (i + 1), buckets[i]);
> +	}
> +
> +	free(buckets);
> +}
> +
>  /*
>   * Generic lock synchronization benchmark function.  Sets up threads and
>   * thread affinities.
> @@ -191,6 +274,12 @@ static int bench_sync_lock_generic(struct lock_ops *ops, int argc, const char **
>  		workers[i].barrier = &barrier;
>  		workers[i].ops = ops;
>  
> +		if (do_hist) {
> +			workers[i].lock_latency = calloc(nspins, sizeof(*workers[i].lock_latency));
> +			if (!workers[i].lock_latency)
> +				err(EXIT_FAILURE, "calloc");
> +		}
> +
>  		/* Set CPU affinity */
>  		pthread_attr_init(&thread_attr);
>  		CPU_ZERO_S(cpuset_size, cpuset);
> @@ -228,6 +317,12 @@ static int bench_sync_lock_generic(struct lock_ops *ops, int argc, const char **
>  	printf("Lock-unlock latency of %u threads: %"PRIu64".%"PRIu64" ns.\n",
>  			nthreads, avg_ns, avg_ns_dot);
>  
> +	/* Print histogram if requested. */
> +	if (do_hist)
> +		print_histogram(workers);
> +
> +	for (unsigned int i = 0; i < nthreads; ++i)
> +		free(workers[i].lock_latency);
>  	free(workers);
>  
>  	return 0;
> -- 
> 2.50.1.487.gc89ff58d15-goog
>

next prev parent reply	other threads:[~2025-07-31  5:24 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-29  2:26 [PATCH v1 0/7] perf bench: Add qspinlock benchmark Yuzhuo Jing
2025-07-29  2:26 ` [PATCH v1 1/7] tools: Import cmpxchg and xchg functions Yuzhuo Jing
2025-07-31  4:52   ` Namhyung Kim
2025-08-08  6:11   ` kernel test robot
2025-07-29  2:26 ` [PATCH v1 2/7] tools: Import smp_cond_load and atomic_cond_read Yuzhuo Jing
2025-07-29  2:26 ` [PATCH v1 3/7] tools: Partial import of prefetch.h Yuzhuo Jing
2025-07-31  4:54   ` Namhyung Kim
2025-07-29  2:26 ` [PATCH v1 4/7] tools: Implement userspace per-cpu Yuzhuo Jing
2025-07-31  5:07   ` Namhyung Kim
2025-07-29  2:26 ` [PATCH v1 5/7] perf bench: Import qspinlock from kernel Yuzhuo Jing
2025-07-29  2:26 ` [PATCH v1 6/7] perf bench: Add 'bench sync qspinlock' subcommand Yuzhuo Jing
2025-07-31  5:16   ` Namhyung Kim
2025-07-31 13:19     ` Yuzhuo Jing
2025-07-29  2:26 ` [PATCH v1 7/7] perf bench sync: Add latency histogram functionality Yuzhuo Jing
2025-07-31  5:18   ` Namhyung Kim
2025-07-31  5:24   ` Namhyung Kim [this message]
2025-07-31  4:51 ` [PATCH v1 0/7] perf bench: Add qspinlock benchmark Namhyung Kim
2025-08-04 14:28 ` Mark Rutland
2025-09-16 14:18   ` Peter Zijlstra
2025-09-16 17:00     ` Ian Rogers
2025-09-16 20:38       ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aIr-JTyuJBKPJN8c@google.com \
    --to=namhyung@kernel.org \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=alexghiti@rivosinc.com \
    --cc=ast@kernel.org \
    --cc=bigeasy@linutronix.de \
    --cc=brho@google.com \
    --cc=charlie@rivosinc.com \
    --cc=guoren@kernel.org \
    --cc=irogers@google.com \
    --cc=jolsa@kernel.org \
    --cc=kan.liang@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=memxor@gmail.com \
    --cc=mingo@redhat.com \
    --cc=palmer@rivosinc.com \
    --cc=parri.andrea@gmail.com \
    --cc=peterz@infradead.org \
    --cc=yuzhuo@google.com \
    --cc=yzj@umich.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.