All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Alex Bennée" <alex.bennee@linaro.org>
To: "Emilio G. Cota" <cota@braap.org>
Cc: qemu-devel@nongnu.org, Aurelien Jarno <aurelien@aurel32.net>,
	Peter Maydell <peter.maydell@linaro.org>,
	Laurent Vivier <laurent@vivier.eu>,
	Richard Henderson <richard.henderson@linaro.org>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Subject: Re: [Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks
Date: Tue, 27 Mar 2018 09:45:14 +0100	[thread overview]
Message-ID: <871sg5aov9.fsf@linaro.org> (raw)
In-Reply-To: <1521663109-32262-2-git-send-email-cota@braap.org>


Emilio G. Cota <cota@braap.org> writes:

> This will allow us to measure the performance impact of FP
> emulation optimizations.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  tests/fp-bench.c       | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/.gitignore       |   1 +
>  tests/Makefile.include |   3 +-
>  3 files changed, 293 insertions(+), 1 deletion(-)
>  create mode 100644 tests/fp-bench.c
>
> diff --git a/tests/fp-bench.c b/tests/fp-bench.c
> new file mode 100644
> index 0000000..a782093
> --- /dev/null
> +++ b/tests/fp-bench.c
> @@ -0,0 +1,290 @@
> +/*
> + * fp-bench.c - A collection of simple floating point microbenchmarks.
> + *
> + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
> + *
> + * License: GNU GPL, version 2 or later.
> + *   See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "qemu/atomic.h"
> +
> +#include <math.h>
> +
> +#include <sys/time.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <time.h>
> +
> +/* amortize the computation of random inputs */
> +#define OPS_PER_ITER     (1000ULL)
> +
> +#define SEED_A 0xdeadfacedeadface
> +#define SEED_B 0xbadc0feebadc0fee
> +#define SEED_C 0xbeefdeadbeefdead
> +
> +enum op {
> +    OP_ADD,
> +    OP_SUB,
> +    OP_MUL,
> +    OP_DIV,
> +    OP_FMA,
> +    OP_SQRT,
> +};
> +
> +static const char * const op_names[] = {
> +    [OP_ADD] = "add",
> +    [OP_SUB] = "sub",
> +    [OP_MUL] = "mul",
> +    [OP_DIV] = "div",
> +    [OP_FMA] = "fma",
> +    [OP_SQRT] = "sqrt",
> +};
> +
> +static uint64_t n_ops = 10000000;
> +static enum op op;
> +static const char *precision = "float";
> +
> +static const char commands_string[] =
> +    " -n = number of floating point operations\n"
> +    " -o = floating point operation (add, sub, mul, div, fma, sqrt). Default: add\n"
> +    " -p = precision (float|single, double). Default: float";
> +
> +static void usage_complete(int argc, char *argv[])
> +{
> +    fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> +    fprintf(stderr, "options:\n%s\n", commands_string);
> +    exit(-1);
> +}
> +
> +static void set_op(const char *name)
> +{
> +    int i;
> +
> +    for (i = 0; i < ARRAY_SIZE(op_names); i++) {
> +        if (strcmp(name, op_names[i]) == 0) {
> +            op = i;
> +            return;
> +        }
> +    }
> +    fprintf(stderr, "Unsupported op '%s'\n", name);
> +    exit(EXIT_FAILURE);
> +}
> +
> +static inline int64_t get_clock_realtime(void)
> +{
> +    struct timeval tv;
> +
> +    gettimeofday(&tv, NULL);
> +    return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> +    x ^= x >> 12; /* a */
> +    x ^= x << 25; /* b */
> +    x ^= x >> 27; /* c */
> +    return x * UINT64_C(2685821657736338717);
> +}
> +
> +static inline bool u32_is_normal(uint32_t x)
> +{
> +    return ((x + 0x00800000) & 0x7fffffff) >= 0x01000000;
> +}
> +
> +static inline bool u64_is_normal(uint64_t x)
> +{
> +    return ((x + (1ULL << 52)) & -1ULL >> 1) >= 1ULL << 53;
> +}
> +
> +static inline float get_random_float(uint64_t *x)
> +{
> +    uint64_t r = *x;
> +    uint32_t r32;
> +
> +    do {
> +        r = xorshift64star(r);
> +    } while (!u32_is_normal(r));
> +    *x = r;
> +    r32 = r;
> +    return *(float *)&r32;
> +}
> +
> +static inline double get_random_double(uint64_t *x)
> +{
> +    uint64_t r = *x;
> +
> +    do {
> +        r = xorshift64star(r);
> +    } while (!u64_is_normal(r));
> +    *x = r;
> +    return *(double *)&r;
> +}
> +
> +/*
> + * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
> + * volatile.
> + */
> +#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION)                           \
> +    static void NAME(volatile PRECISION *res)                           \
> +    {                                                                   \
> +        uint64_t ra = SEED_A;                                           \
> +        uint64_t i, j;                                                  \
> +                                                                        \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                     \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);   \
> +                                                                        \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                        \
> +                *res = FUNC(a);                                         \
> +            }                                                           \
> +        }                                                               \
> +    }
> +

Have you had a chance to look at whether this will vectorise? I have a
similar benchmark which I compile with multiple options to test normal,
NEON/AdvSIMD and SVE-enabled loops.

> +GEN_BENCH_1OPF(bench_float_sqrt, sqrtf, float)
> +GEN_BENCH_1OPF(bench_double_sqrt, sqrt, double)
> +#undef GEN_BENCH_1OPF
> +
> +#define GEN_BENCH_2OP(NAME, OP, PRECISION)                              \
> +    static void NAME(volatile PRECISION *res)                           \
> +    {                                                                   \
> +        uint64_t ra = SEED_A;                                           \
> +        uint64_t rb = SEED_B;                                           \
> +        uint64_t i, j;                                                  \
> +                                                                        \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                     \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);   \
> +            volatile PRECISION b = glue(get_random_, PRECISION)(&rb);   \
> +                                                                        \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                        \
> +                *res = a OP b;                                          \
> +            }                                                           \
> +        }                                                               \
> +    }
> +
> +GEN_BENCH_2OP(bench_float_add, +, float)
> +GEN_BENCH_2OP(bench_float_sub, -, float)
> +GEN_BENCH_2OP(bench_float_mul, *, float)
> +GEN_BENCH_2OP(bench_float_div, /, float)
> +
> +GEN_BENCH_2OP(bench_double_add, +, double)
> +GEN_BENCH_2OP(bench_double_sub, -, double)
> +GEN_BENCH_2OP(bench_double_mul, *, double)
> +GEN_BENCH_2OP(bench_double_div, /, double)
> +
> +#define GEN_BENCH_3OPF(NAME, FUNC, PRECISION)                           \
> +    static void NAME(volatile PRECISION *res)                           \
> +    {                                                                   \
> +        uint64_t ra = SEED_A;                                           \
> +        uint64_t rb = SEED_B;                                           \
> +        uint64_t rc = SEED_C;                                           \
> +        uint64_t i, j;                                                  \
> +                                                                        \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                     \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);   \
> +            volatile PRECISION b = glue(get_random_, PRECISION)(&rb);   \
> +            volatile PRECISION c = glue(get_random_, PRECISION)(&rc);   \
> +                                                                        \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                        \
> +                *res = FUNC(a, b, c);                                   \
> +            }                                                           \
> +        }                                                               \
> +    }
> +
> +GEN_BENCH_3OPF(bench_float_fma, fmaf, float)
> +GEN_BENCH_3OPF(bench_double_fma, fma, double)
> +#undef GEN_BENCH_3OPF
> +
> +static void parse_args(int argc, char *argv[])
> +{
> +    int c;
> +
> +    for (;;) {
> +        c = getopt(argc, argv, "n:ho:p:");
> +        if (c < 0) {
> +            break;
> +        }
> +        switch (c) {
> +        case 'h':
> +            usage_complete(argc, argv);
> +            exit(0);
> +        case 'n':
> +            n_ops = atoll(optarg);
> +            if (n_ops < OPS_PER_ITER) {
> +                n_ops = OPS_PER_ITER;
> +            }
> +            n_ops -= n_ops % OPS_PER_ITER;
> +            break;
> +        case 'o':
> +            set_op(optarg);
> +            break;
> +        case 'p':
> +            precision = optarg;
> +            if (strcmp(precision, "float") &&
> +                strcmp(precision, "single") &&
> +                strcmp(precision, "double")) {
> +                fprintf(stderr, "Unsupported precision '%s'\n", precision);
> +                exit(EXIT_FAILURE);

Supporting half-precision, if the compiler does, would also be useful here.

> +            }
> +            break;
> +        }
> +    }
> +}
> +
> +#define CALL_BENCH(OP, PRECISION, RESP)                 \
> +    do {                                                \
> +        switch (OP) {                                   \
> +        case OP_ADD:                                    \
> +            glue(glue(bench_, PRECISION), _add)(RESP);  \
> +            break;                                      \
> +        case OP_SUB:                                    \
> +            glue(glue(bench_, PRECISION), _sub)(RESP);  \
> +            break;                                      \
> +        case OP_MUL:                                    \
> +            glue(glue(bench_, PRECISION), _mul)(RESP);  \
> +            break;                                      \
> +        case OP_DIV:                                    \
> +            glue(glue(bench_, PRECISION), _div)(RESP);  \
> +            break;                                      \
> +        case OP_FMA:                                    \
> +            glue(glue(bench_, PRECISION), _fma)(RESP);  \
> +            break;                                      \
> +        case OP_SQRT:                                   \
> +            glue(glue(bench_, PRECISION), _sqrt)(RESP); \
> +            break;                                      \
> +        default:                                        \
> +            g_assert_not_reached();                     \
> +        }                                               \
> +    } while (0)
> +
> +int main(int argc, char *argv[])
> +{
> +    int64_t t0, t1;
> +    double resd;
> +
> +    parse_args(argc, argv);
> +    if (!strcmp(precision, "float") || !strcmp(precision, "single")) {
> +        float res;
> +        t0 = get_clock_realtime();
> +        CALL_BENCH(op, float, &res);
> +        t1 = get_clock_realtime();
> +        resd = res;
> +    } else if (!strcmp(precision, "double")) {
> +        t0 = get_clock_realtime();
> +        CALL_BENCH(op, double, &resd);
> +        t1 = get_clock_realtime();
> +    } else {
> +        g_assert_not_reached();
> +    }
> +    printf("%.2f MFlops\n", (double)n_ops / (t1 - t0) * 1e3);
> +    if (resd) {
> +        return 0;
> +    }
> +    return 0;
> +}
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 18e58b2..df69175 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -12,6 +12,7 @@ check-qobject
>  check-qstring
>  check-qom-interface
>  check-qom-proplist
> +fp-bench
>  qht-bench
>  rcutorture
>  test-aio
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index ef9b88c..f6121ee 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o tests/check-qdict.o \
>  	tests/rcutorture.o tests/test-rcu-list.o \
>  	tests/test-qdist.o tests/test-shift128.o \
>  	tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> -	tests/atomic_add-bench.o
> +	tests/atomic_add-bench.o tests/fp-bench.o

I'm not sure why, but "make check" didn't build this; I had to run
"make tests/fp-bench" explicitly. Like atomic_add-bench, though, these
are explicitly guest-facing tests, so maybe we should move them once
tests/tcg is working again. I'll have another run at that this week.

>
>  $(test-obj-y): QEMU_INCLUDES += -Itests
>  QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(tes
>  tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
>  tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
>  tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
> +tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
>
>  tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
>  	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\

Anyway for this version:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée

  reply	other threads:[~2018-03-27  8:45 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-21 20:11 [Qemu-devel] [PATCH v1 00/14] fp-test + hostfloat Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks Emilio G. Cota
2018-03-27  8:45   ` Alex Bennée [this message]
2018-03-27 17:21     ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 02/14] tests: add fp-test, a floating point test suite Emilio G. Cota
2018-03-27 10:13   ` Alex Bennée
2018-03-27 18:00     ` Emilio G. Cota
2018-03-28  9:51       ` Alex Bennée
2018-03-28 15:36         ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 03/14] softfloat: fix {min, max}nummag for same-abs-value inputs Emilio G. Cota
2018-03-27 10:15   ` Alex Bennée
2018-03-27 10:15   ` Alex Bennée
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 04/14] fp-test: add muladd variants Emilio G. Cota
2018-03-27 11:33   ` Alex Bennée
2018-03-27 18:03     ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 05/14] softfloat: add float32_is_normal and float64_is_normal Emilio G. Cota
2018-03-27 11:34   ` Alex Bennée
2018-03-27 18:05     ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 06/14] softfloat: add float32_is_denormal and float64_is_denormal Emilio G. Cota
2018-03-27 11:35   ` Alex Bennée
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 07/14] fpu: introduce hostfloat Emilio G. Cota
2018-03-21 20:41   ` Laurent Vivier
2018-03-21 21:45     ` Emilio G. Cota
2018-03-27 11:49   ` Alex Bennée
2018-03-27 18:16     ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction Emilio G. Cota
2018-03-22  5:05   ` Richard Henderson
2018-03-22  5:57     ` Emilio G. Cota
2018-03-22  6:41       ` Richard Henderson
2018-03-22 15:08         ` Emilio G. Cota
2018-03-22 15:12           ` Laurent Vivier
2018-03-22 19:57         ` Emilio G. Cota
2018-03-27 11:41           ` Alex Bennée
2018-03-27 18:08             ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 09/14] hostfloat: support float32/64 multiplication Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 10/14] hostfloat: support float32/64 division Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 11/14] hostfloat: support float32/64 fused multiply-add Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 12/14] hostfloat: support float32/64 square root Emilio G. Cota
2018-03-22  1:29   ` Alex Bennée
2018-03-22  4:02     ` Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 13/14] hostfloat: support float32/64 comparison Emilio G. Cota
2018-03-21 20:11 ` [Qemu-devel] [PATCH v1 14/14] hostfloat: support float32_to_float64 Emilio G. Cota
2018-03-21 20:36 ` [Qemu-devel] [PATCH v1 00/14] fp-test + hostfloat no-reply
2018-03-22  5:02 ` no-reply
2018-03-22  8:56 ` Alex Bennée
2018-03-22 15:28   ` Emilio G. Cota

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=871sg5aov9.fsf@linaro.org \
    --to=alex.bennee@linaro.org \
    --cc=aurelien@aurel32.net \
    --cc=cota@braap.org \
    --cc=laurent@vivier.eu \
    --cc=mark.cave-ayland@ilande.co.uk \
    --cc=pbonzini@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.