From: "Philippe Mathieu-Daudé" <philmd@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>, qemu-devel@nongnu.org
Cc: qemu-arm@nongnu.org, qemu-stable@nongnu.org
Subject: Re: [PATCH] target/arm: Fix 32-bit SMOPA
Date: Tue, 5 Mar 2024 10:19:15 +0100 [thread overview]
Message-ID: <fa8f1d52-f88d-4107-abfa-fa10ebef511c@linaro.org> (raw)
In-Reply-To: <20240305023116.234256-1-richard.henderson@linaro.org>
Hi Richard,
On 5/3/24 03:31, Richard Henderson wrote:
> The while the 8-bit input elements are sequential in the input vector,
> the 32-bit output elements are not sequential in the output matrix.
> Do not attempt to compute 2 32-bit outputs at the same time.
>
> Cc: qemu-stable@nongnu.org
> Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/arm/tcg/sme_helper.c | 77 ++++++++++++++++++-------------
> tests/tcg/aarch64/sme-smopa-1.c | 47 +++++++++++++++++++
> tests/tcg/aarch64/sme-smopa-2.c | 54 ++++++++++++++++++++++
> tests/tcg/aarch64/Makefile.target | 2 +-
> 4 files changed, 147 insertions(+), 33 deletions(-)
> create mode 100644 tests/tcg/aarch64/sme-smopa-1.c
> create mode 100644 tests/tcg/aarch64/sme-smopa-2.c
>
> diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c
> index 904bfdac43..ef39eee48d 100644
> --- a/target/arm/tcg/sme_helper.c
> +++ b/target/arm/tcg/sme_helper.c
> @@ -1083,11 +1083,32 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
> }
> }
>
> -typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);
> +typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);
> +static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
> + uint8_t *pn, uint8_t *pm,
> + uint32_t desc, IMOPFn32 *fn)
> +{
> + intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
> + bool neg = simd_data(desc);
>
> -static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
> - uint8_t *pn, uint8_t *pm,
> - uint32_t desc, IMOPFn *fn)
> + for (row = 0; row < oprsz; ++row) {
> + uint8_t pa = pn[H1(row >> 1)] >> ((row & 1) * 4);
> + uint32_t *za_row = &za[H4(tile_vslice_index(row))];
> + uint32_t n = zn[H4(row)];
> +
> + for (col = 0; col < oprsz; ++col) {
> + uint8_t pb = pm[H1(col >> 1)] >> ((col & 1) * 4);
> + uint32_t *a = &za_row[col];
Shouldn't this be:
uint32_t *a = &za_row[H4(col)];
to work on big endian hosts?
> +
> + *a = fn(n, zm[H4(col)], *a, pa & pb & 0xf, neg);
> + }
> + }
> +}
> +
> +typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);
> +static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
> + uint8_t *pn, uint8_t *pm,
> + uint32_t desc, IMOPFn64 *fn)
> {
> intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
> bool neg = simd_data(desc);
> @@ -1107,25 +1128,16 @@ static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
> }
>
> #define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
> -static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
> +static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
> { \
> - uint32_t sum0 = 0, sum1 = 0; \
> + uint32_t sum = 0; \
> /* Apply P to N as a mask, making the inactive elements 0. */ \
> n &= expand_pred_b(p); \
> - sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
> - sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \
> - sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
> - sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \
> - sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
> - sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \
> - sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
> - sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \
> - if (neg) { \
> - sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \
> - } else { \
> - sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \
> - } \
> - return ((uint64_t)sum1 << 32) | sum0; \
> + sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
> + sum += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \
> + sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
> + sum += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \
> + return neg ? a - sum : a + sum; \
> }
next prev parent reply other threads:[~2024-03-05 9:19 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-05 2:31 [PATCH] target/arm: Fix 32-bit SMOPA Richard Henderson
2024-03-05 9:19 ` Philippe Mathieu-Daudé [this message]
2024-03-05 15:51 ` Richard Henderson
2024-03-05 16:01 ` Michael Tokarev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=fa8f1d52-f88d-4107-abfa-fa10ebef511c@linaro.org \
--to=philmd@linaro.org \
--cc=qemu-arm@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=qemu-stable@nongnu.org \
--cc=richard.henderson@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).