* [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
@ 2023-05-23 13:11 Alex Bennée
2023-05-23 13:57 ` BALATON Zoltan
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Alex Bennée @ 2023-05-23 13:11 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Bennée, Richard Henderson, BALATON Zoltan,
Aurelien Jarno, Peter Maydell
Balton discovered that asserts for the extract/deposit calls had a
significant impact on a lame benchmark on qemu-ppc. Replicating with:
./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
-h pts-trondheim-3.wav pts-trondheim-3.mp3
showed the pack/unpack routines were not eliding the assert checks as
they should have been, causing them to figure prominently in the profile:
11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
8.26% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
3.62% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
After this patch the same test runs 31 seconds faster with a profile
where the generated code dominates more:
+ 14.12% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619420
+ 13.30% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000616850
+ 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
+ 10.62% 0.00% qemu-ppc64 [unknown] [.] 0x000000400061bf70
+ 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
+ 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
+ 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize.constprop.0
+ 6.46% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000620130
+ 6.42% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619400
+ 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
+ 5.85% 0.00% qemu-ppc64 [unknown] [.] 0x00000040006167e0
+ 5.74% 0.00% qemu-ppc64 [unknown] [.] 0x0000b693fcffffd3
+ 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
[AJB: Patchified rth's suggestion]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: BALATON Zoltan <balaton@eik.bme.hu>
---
fpu/softfloat.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 108f9cb224..42e6c188b4 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -593,27 +593,27 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
};
}
-static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
+static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
{
unpack_raw64(p, &float16_params, f);
}
-static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
+static void QEMU_FLATTEN bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
unpack_raw64(p, &bfloat16_params, f);
}
-static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
+static void QEMU_FLATTEN float32_unpack_raw(FloatParts64 *p, float32 f)
{
unpack_raw64(p, &float32_params, f);
}
-static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
+static void QEMU_FLATTEN float64_unpack_raw(FloatParts64 *p, float64 f)
{
unpack_raw64(p, &float64_params, f);
}
-static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
+static void QEMU_FLATTEN floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
{
*p = (FloatParts128) {
.cls = float_class_unclassified,
@@ -623,7 +623,7 @@ static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
};
}
-static void float128_unpack_raw(FloatParts128 *p, float128 f)
+static void QEMU_FLATTEN float128_unpack_raw(FloatParts128 *p, float128 f)
{
const int f_size = float128_params.frac_size - 64;
const int e_size = float128_params.exp_size;
@@ -650,27 +650,27 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
return ret;
}
-static inline float16 float16_pack_raw(const FloatParts64 *p)
+static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
{
return make_float16(pack_raw64(p, &float16_params));
}
-static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
+static bfloat16 QEMU_FLATTEN bfloat16_pack_raw(const FloatParts64 *p)
{
return pack_raw64(p, &bfloat16_params);
}
-static inline float32 float32_pack_raw(const FloatParts64 *p)
+static float32 QEMU_FLATTEN float32_pack_raw(const FloatParts64 *p)
{
return make_float32(pack_raw64(p, &float32_params));
}
-static inline float64 float64_pack_raw(const FloatParts64 *p)
+static float64 QEMU_FLATTEN float64_pack_raw(const FloatParts64 *p)
{
return make_float64(pack_raw64(p, &float64_params));
}
-static float128 float128_pack_raw(const FloatParts128 *p)
+static float128 QEMU_FLATTEN float128_pack_raw(const FloatParts128 *p)
{
const int f_size = float128_params.frac_size - 64;
const int e_size = float128_params.exp_size;
--
2.39.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:11 [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining Alex Bennée
@ 2023-05-23 13:57 ` BALATON Zoltan
2023-05-23 14:33 ` Richard Henderson
` (2 more replies)
2023-05-23 14:18 ` Philippe Mathieu-Daudé
2023-05-23 15:34 ` Richard Henderson
2 siblings, 3 replies; 14+ messages in thread
From: BALATON Zoltan @ 2023-05-23 13:57 UTC (permalink / raw)
To: Alex Bennée
Cc: qemu-devel, Richard Henderson, Aurelien Jarno, Peter Maydell
On Tue, 23 May 2023, Alex Bennée wrote:
> Balton discovered that asserts for the extract/deposit calls had a
My name is missing an a, and my given name is Zoltan. (Family name and
given name are in the other order in Hungarian.) Maybe just add a
Reported-by tag instead if you want to record it.
> significant impact on a lame benchmark on qemu-ppc. Replicating with:
>
> ./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
> -h pts-trondheim-3.wav pts-trondheim-3.mp3
>
> showed up the pack/unpack routines not eliding the assert checks as it
> should have done causing them to prominently figure in the profile:
>
> 11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
> 11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> 8.26% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> 6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> 5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> 4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
> 4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
> 3.62% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> After this patch the same test runs 31 seconds faster with a profile
> where the generated code dominates more:
>
> + 14.12% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619420
> + 13.30% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000616850
> + 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> + 10.62% 0.00% qemu-ppc64 [unknown] [.] 0x000000400061bf70
> + 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> + 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> + 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize.constprop.0
> + 6.46% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000620130
> + 6.42% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619400
> + 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> + 5.85% 0.00% qemu-ppc64 [unknown] [.] 0x00000040006167e0
> + 5.74% 0.00% qemu-ppc64 [unknown] [.] 0x0000b693fcffffd3
> + 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> Suggested-by: Richard Henderson <richard.henderson@linaro.org>
> Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
> [AJB: Patchified rth's suggestion]
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: BALATON Zoltan <balaton@eik.bme.hu>
Replace Cc: with
Tested-by: BALATON Zoltan <balaton@eik.bme.hu>
This solves the softfloat-related usages; the rest are probably lower
overhead, as I could not measure any further improvement from removing
asserts on top of this patch. I still have these functions high in my
profiling result:
children self command symbol
11.40% 10.86% qemu-system-ppc helper_compute_fprf_float64
11.25% 0.61% qemu-system-ppc helper_fmadds
10.01% 3.23% qemu-system-ppc float64r32_round_pack_canonical
8.59% 1.80% qemu-system-ppc helper_float_check_status
8.34% 7.23% qemu-system-ppc parts64_muladd
8.16% 0.67% qemu-system-ppc helper_fmuls
8.08% 0.43% qemu-system-ppc parts64_uncanon
7.49% 1.78% qemu-system-ppc float64r32_mul
7.32% 7.32% qemu-system-ppc parts64_uncanon_normal
6.48% 0.52% qemu-system-ppc helper_fadds
6.31% 6.31% qemu-system-ppc do_float_check_status
5.99% 1.14% qemu-system-ppc float64r32_add
Any idea on those?
Unrelated to this patch, I also started to see random crashes with a DSI on
a dcbz instruction, which did not happen before (or not frequently
enough for me to notice). I did not bisect it as it happens randomly, but
I wonder if it could be related to the recent unaligned access changes or
some other TCG change? Any idea what to check?
Regards,
BALATON Zoltan
> ---
> fpu/softfloat.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 108f9cb224..42e6c188b4 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -593,27 +593,27 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
> };
> }
>
> -static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
> +static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
> {
> unpack_raw64(p, &float16_params, f);
> }
>
> -static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
> +static void QEMU_FLATTEN bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
> {
> unpack_raw64(p, &bfloat16_params, f);
> }
>
> -static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
> +static void QEMU_FLATTEN float32_unpack_raw(FloatParts64 *p, float32 f)
> {
> unpack_raw64(p, &float32_params, f);
> }
>
> -static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
> +static void QEMU_FLATTEN float64_unpack_raw(FloatParts64 *p, float64 f)
> {
> unpack_raw64(p, &float64_params, f);
> }
>
> -static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
> +static void QEMU_FLATTEN floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
> {
> *p = (FloatParts128) {
> .cls = float_class_unclassified,
> @@ -623,7 +623,7 @@ static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
> };
> }
>
> -static void float128_unpack_raw(FloatParts128 *p, float128 f)
> +static void QEMU_FLATTEN float128_unpack_raw(FloatParts128 *p, float128 f)
> {
> const int f_size = float128_params.frac_size - 64;
> const int e_size = float128_params.exp_size;
> @@ -650,27 +650,27 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
> return ret;
> }
>
> -static inline float16 float16_pack_raw(const FloatParts64 *p)
> +static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
> {
> return make_float16(pack_raw64(p, &float16_params));
> }
>
> -static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
> +static bfloat16 QEMU_FLATTEN bfloat16_pack_raw(const FloatParts64 *p)
> {
> return pack_raw64(p, &bfloat16_params);
> }
>
> -static inline float32 float32_pack_raw(const FloatParts64 *p)
> +static float32 QEMU_FLATTEN float32_pack_raw(const FloatParts64 *p)
> {
> return make_float32(pack_raw64(p, &float32_params));
> }
>
> -static inline float64 float64_pack_raw(const FloatParts64 *p)
> +static float64 QEMU_FLATTEN float64_pack_raw(const FloatParts64 *p)
> {
> return make_float64(pack_raw64(p, &float64_params));
> }
>
> -static float128 float128_pack_raw(const FloatParts128 *p)
> +static float128 QEMU_FLATTEN float128_pack_raw(const FloatParts128 *p)
> {
> const int f_size = float128_params.frac_size - 64;
> const int e_size = float128_params.exp_size;
>
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:11 [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining Alex Bennée
2023-05-23 13:57 ` BALATON Zoltan
@ 2023-05-23 14:18 ` Philippe Mathieu-Daudé
2023-05-23 15:34 ` Richard Henderson
2 siblings, 0 replies; 14+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-05-23 14:18 UTC (permalink / raw)
To: Alex Bennée, qemu-devel
Cc: Richard Henderson, BALATON Zoltan, Aurelien Jarno, Peter Maydell
On 23/5/23 15:11, Alex Bennée wrote:
> Balton discovered that asserts for the extract/deposit calls had a
Zoltan
> significant impact on a lame benchmark on qemu-ppc. Replicating with:
>
> ./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
> -h pts-trondheim-3.wav pts-trondheim-3.mp3
>
> showed up the pack/unpack routines not eliding the assert checks as it
> should have done causing them to prominently figure in the profile:
Worth mentioning "even using --disable-debug"?
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> 11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
> 11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> 8.26% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> 6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> 5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> 4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
> 4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
> 3.62% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> After this patch the same test runs 31 seconds faster with a profile
> where the generated code dominates more:
>
> + 14.12% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619420
> + 13.30% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000616850
> + 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> + 10.62% 0.00% qemu-ppc64 [unknown] [.] 0x000000400061bf70
> + 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> + 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> + 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize.constprop.0
> + 6.46% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000620130
> + 6.42% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619400
> + 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> + 5.85% 0.00% qemu-ppc64 [unknown] [.] 0x00000040006167e0
> + 5.74% 0.00% qemu-ppc64 [unknown] [.] 0x0000b693fcffffd3
> + 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> Suggested-by: Richard Henderson <richard.henderson@linaro.org>
> Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
> [AJB: Patchified rth's suggestion]
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: BALATON Zoltan <balaton@eik.bme.hu>
> ---
> fpu/softfloat.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:57 ` BALATON Zoltan
@ 2023-05-23 14:33 ` Richard Henderson
2023-05-23 17:51 ` BALATON Zoltan
` (2 more replies)
2023-05-26 11:56 ` BALATON Zoltan
2023-06-22 20:55 ` BALATON Zoltan
2 siblings, 3 replies; 14+ messages in thread
From: Richard Henderson @ 2023-05-23 14:33 UTC (permalink / raw)
To: BALATON Zoltan, Alex Bennée
Cc: qemu-devel, Aurelien Jarno, Peter Maydell
On 5/23/23 06:57, BALATON Zoltan wrote:
> This solves the softfloat related usages, the rest probably are lower overhead, I could
> not measure any more improvement with removing asserts on top of this patch. I still have
> these functions high in my profiling result:
>
> children self command symbol
> 11.40% 10.86% qemu-system-ppc helper_compute_fprf_float64
You might need to dig in with perf here, but my first guess is
#define COMPUTE_CLASS(tp) \
static int tp##_classify(tp arg) \
{ \
int ret = tp##_is_neg(arg) * is_neg; \
if (unlikely(tp##_is_any_nan(arg))) { \
float_status dummy = { }; /* snan_bit_is_one = 0 */ \
ret |= (tp##_is_signaling_nan(arg, &dummy) \
? is_snan : is_qnan); \
} else if (unlikely(tp##_is_infinity(arg))) { \
ret |= is_inf; \
} else if (tp##_is_zero(arg)) { \
ret |= is_zero; \
} else if (tp##_is_zero_or_denormal(arg)) { \
ret |= is_denormal; \
} else { \
ret |= is_normal; \
} \
return ret; \
}
The tests are poorly ordered, testing many unlikely things before the most likely thing
(normal). A better ordering would be
if (likely(tp##_is_normal(arg))) {
} else if (tp##_is_zero(arg)) {
} else if (tp##_is_zero_or_denormal(arg)) {
} else if (tp##_is_infinity(arg)) {
} else {
// nan case
}
Secondly, we compute the classify bitmask, and then deconstruct the mask again in
set_fprf_from_class. Since we don't use the classify bitmask for anything else, better
would be to compute the fprf value directly in the if-ladder.
> 11.25% 0.61% qemu-system-ppc helper_fmadds
This is unsurprising, and there is not much that can be done.
All of the work is in muladd doing the arithmetic.
> Unrelated to this patch I also started to see random crashes with a DSI on a dcbz
> instruction now which did not happen before (or not frequently enough for me to notice). I
> did not bisect that as it happens randomly but I wonder if it could be related to recent
> unaligned access changes or some other TCG change? Any idea what to check?
No idea.
r~
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:11 [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining Alex Bennée
2023-05-23 13:57 ` BALATON Zoltan
2023-05-23 14:18 ` Philippe Mathieu-Daudé
@ 2023-05-23 15:34 ` Richard Henderson
2 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-05-23 15:34 UTC (permalink / raw)
To: Alex Bennée, qemu-devel
Cc: BALATON Zoltan, Aurelien Jarno, Peter Maydell
On 5/23/23 06:11, Alex Bennée wrote:
> Balton discovered that asserts for the extract/deposit calls had a
> significant impact on a lame benchmark on qemu-ppc. Replicating with:
>
> ./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
> -h pts-trondheim-3.wav pts-trondheim-3.mp3
>
> showed up the pack/unpack routines not eliding the assert checks as it
> should have done causing them to prominently figure in the profile:
>
> 11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
> 11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> 8.26% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> 6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> 5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> 4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
> 4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
> 3.62% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> After this patch the same test runs 31 seconds faster with a profile
> where the generated code dominates more:
>
> + 14.12% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619420
> + 13.30% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000616850
> + 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
> + 10.62% 0.00% qemu-ppc64 [unknown] [.] 0x000000400061bf70
> + 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
> + 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
> + 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize.constprop.0
> + 6.46% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000620130
> + 6.42% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619400
> + 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
> + 5.85% 0.00% qemu-ppc64 [unknown] [.] 0x00000040006167e0
> + 5.74% 0.00% qemu-ppc64 [unknown] [.] 0x0000b693fcffffd3
> + 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>
> Suggested-by: Richard Henderson <richard.henderson@linaro.org>
> Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
> [AJB: Patchified rth's suggestion]
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: BALATON Zoltan <balaton@eik.bme.hu>
> ---
> fpu/softfloat.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
r~
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 14:33 ` Richard Henderson
@ 2023-05-23 17:51 ` BALATON Zoltan
2023-05-25 13:22 ` Paolo Bonzini
2023-05-25 13:30 ` Paolo Bonzini
2 siblings, 0 replies; 14+ messages in thread
From: BALATON Zoltan @ 2023-05-23 17:51 UTC (permalink / raw)
To: Richard Henderson
Cc: Alex Bennée, qemu-devel, Aurelien Jarno, Peter Maydell
On Tue, 23 May 2023, Richard Henderson wrote:
> On 5/23/23 06:57, BALATON Zoltan wrote:
>> This solves the softfloat related usages, the rest probably are lower
>> overhead, I could not measure any more improvement with removing asserts on
>> top of this patch. I still have these functions high in my profiling
>> result:
>>
>> children self command symbol
>> 11.40% 10.86% qemu-system-ppc helper_compute_fprf_float64
>
> You might need to dig in with perf here, but my first guess is
>
> #define COMPUTE_CLASS(tp) \
> static int tp##_classify(tp arg) \
> { \
> int ret = tp##_is_neg(arg) * is_neg; \
> if (unlikely(tp##_is_any_nan(arg))) { \
> float_status dummy = { }; /* snan_bit_is_one = 0 */ \
> ret |= (tp##_is_signaling_nan(arg, &dummy) \
> ? is_snan : is_qnan); \
> } else if (unlikely(tp##_is_infinity(arg))) { \
> ret |= is_inf; \
> } else if (tp##_is_zero(arg)) { \
> ret |= is_zero; \
> } else if (tp##_is_zero_or_denormal(arg)) { \
> ret |= is_denormal; \
> } else { \
> ret |= is_normal; \
> } \
> return ret; \
> }
>
> The tests are poorly ordered, testing many unlikely things before the most
> likely thing (normal). A better ordering would be
>
> if (likely(tp##_is_normal(arg))) {
> } else if (tp##_is_zero(arg)) {
> } else if (tp##_is_zero_or_denormal(arg)) {
> } else if (tp##_is_infinity(arg)) {
> } else {
> // nan case
> }
>
> Secondly, we compute the classify bitmask, and then deconstruct the mask
> again in set_fprf_from_class. Since we don't use the classify bitmask for
> anything else, better would be to compute the fprf value directly in the
> if-ladder.
Thanks for the guidance. Alex, will you make a patch of this too, or should
I try to figure out how to do that? I'm not sure I understood everything
above, having read it only once.
Regards,
BALATON Zoltan
>> 11.25% 0.61% qemu-system-ppc helper_fmadds
>
> This is unsurprising, and nothing much that can be done.
> All of the work is in muladd doing the arithmetic.
>
>> Unrelated to this patch I also started to see random crashes with a DSI on
>> a dcbz instruction now which did not happen before (or not frequently
>> enough for me to notice). I did not bisect that as it happens randomly but
>> I wonder if it could be related to recent unaligned access changes or some
>> other TCG change? Any idea what to check?
>
> No idea.
>
>
> r~
>
>
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 14:33 ` Richard Henderson
2023-05-23 17:51 ` BALATON Zoltan
@ 2023-05-25 13:22 ` Paolo Bonzini
2023-05-25 13:30 ` Richard Henderson
2023-05-25 23:15 ` BALATON Zoltan
2023-05-25 13:30 ` Paolo Bonzini
2 siblings, 2 replies; 14+ messages in thread
From: Paolo Bonzini @ 2023-05-25 13:22 UTC (permalink / raw)
To: Richard Henderson, BALATON Zoltan, Alex Bennée
Cc: qemu-devel, Aurelien Jarno, Peter Maydell
On 5/23/23 16:33, Richard Henderson wrote:
>
>
> The tests are poorly ordered, testing many unlikely things before the
> most likely thing (normal). A better ordering would be
>
> if (likely(tp##_is_normal(arg))) {
> } else if (tp##_is_zero(arg)) {
> } else if (tp##_is_zero_or_denormal(arg)) {
> } else if (tp##_is_infinity(arg)) {
> } else {
> // nan case
> }
>
> Secondly, we compute the classify bitmask, and then deconstruct the mask
> again in set_fprf_from_class. Since we don't use the classify bitmask
> for anything else, better would be to compute the fprf value directly in
> the if-ladder.
So something like this:
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index a66e16c2128c..daed97ca178e 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -141,62 +141,30 @@ static inline int ppc_float64_get_unbiased_exp(float64 f)
return ((f >> 52) & 0x7FF) - 1023;
}
-/* Classify a floating-point number. */
-enum {
- is_normal = 1,
- is_zero = 2,
- is_denormal = 4,
- is_inf = 8,
- is_qnan = 16,
- is_snan = 32,
- is_neg = 64,
-};
-
-#define COMPUTE_CLASS(tp) \
-static int tp##_classify(tp arg) \
-{ \
- int ret = tp##_is_neg(arg) * is_neg; \
- if (unlikely(tp##_is_any_nan(arg))) { \
- float_status dummy = { }; /* snan_bit_is_one = 0 */ \
- ret |= (tp##_is_signaling_nan(arg, &dummy) \
- ? is_snan : is_qnan); \
- } else if (unlikely(tp##_is_infinity(arg))) { \
- ret |= is_inf; \
- } else if (tp##_is_zero(arg)) { \
- ret |= is_zero; \
- } else if (tp##_is_zero_or_denormal(arg)) { \
- ret |= is_denormal; \
- } else { \
- ret |= is_normal; \
- } \
- return ret; \
-}
-
-COMPUTE_CLASS(float16)
-COMPUTE_CLASS(float32)
-COMPUTE_CLASS(float64)
-COMPUTE_CLASS(float128)
-
-static void set_fprf_from_class(CPUPPCState *env, int class)
+static void set_fprf(CPUPPCState *env, uint8_t ret)
{
- static const uint8_t fprf[6][2] = {
- { 0x04, 0x08 }, /* normalized */
- { 0x02, 0x12 }, /* zero */
- { 0x14, 0x18 }, /* denormalized */
- { 0x05, 0x09 }, /* infinity */
- { 0x11, 0x11 }, /* qnan */
- { 0x00, 0x00 }, /* snan -- flags are undefined */
- };
- bool isneg = class & is_neg;
-
env->fpscr &= ~FP_FPRF;
- env->fpscr |= fprf[ctz32(class)][isneg] << FPSCR_FPRF;
+ env->fpscr |= ret << FPSCR_FPRF;
}
-#define COMPUTE_FPRF(tp) \
-void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
-{ \
- set_fprf_from_class(env, tp##_classify(arg)); \
+#define COMPUTE_FPRF(tp) \
+void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
+{ \
+ int ret; \
+ if (tp##_is_normal(arg)) { \
+ ret = 0x0408; \
+ } else if (tp##_is_zero(arg)) { \
+ ret = 0x0212; \
+ } else if (tp##_is_zero_or_denormal(arg)) { \
+ ret = 0x1418; \
+ } else if (unlikely(tp##_is_infinity(arg))) { \
+ ret = 0x0509; \
+ } else { \
+ float_status dummy = { }; /* snan_bit_is_one = 0 */ \
+ ret = (tp##_is_signaling_nan(arg, &dummy) \
+ ? 0x0000 : 0x1111); \
+ } \
+ set_fprf(env, tp##_is_neg(arg) ? (uint8_t)ret : ret >> 8); \
}
COMPUTE_FPRF(float16)
Not tested beyond compilation, but if Zoltan reports that it helps
I can write a commit message and submit it.
Paolo
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 14:33 ` Richard Henderson
2023-05-23 17:51 ` BALATON Zoltan
2023-05-25 13:22 ` Paolo Bonzini
@ 2023-05-25 13:30 ` Paolo Bonzini
2023-05-25 23:19 ` BALATON Zoltan
2 siblings, 1 reply; 14+ messages in thread
From: Paolo Bonzini @ 2023-05-25 13:30 UTC (permalink / raw)
To: Richard Henderson, BALATON Zoltan, Alex Bennée
Cc: qemu-devel, Aurelien Jarno, Peter Maydell
On 5/23/23 16:33, Richard Henderson wrote:
>
> The tests are poorly ordered, testing many unlikely things before the
> most likely thing (normal). A better ordering would be
>
> if (likely(tp##_is_normal(arg))) {
> } else if (tp##_is_zero(arg)) {
> } else if (tp##_is_zero_or_denormal(arg)) {
> } else if (tp##_is_infinity(arg)) {
> } else {
> // nan case
> }
Might also benefit from an is_finite predicate (true if zero, normal, or
denormal), to do
if (tp##_is_finite(arg)) {
if (!tp##_is_zero_or_denormal(arg)) {
// normal
} else if (tp##_is_zero(arg)) {
} else {
// denormal
}
} else if (tp##_is_infinity(arg)) {
} else {
// nan
}
since is_normal is a bit more complex and less efficient than the others.
The compiler should easily reuse the result of masking away the sign bit.
Paolo
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-25 13:22 ` Paolo Bonzini
@ 2023-05-25 13:30 ` Richard Henderson
2023-05-25 23:15 ` BALATON Zoltan
1 sibling, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-05-25 13:30 UTC (permalink / raw)
To: Paolo Bonzini, BALATON Zoltan, Alex Bennée
Cc: qemu-devel, Aurelien Jarno, Peter Maydell
On 5/25/23 06:22, Paolo Bonzini wrote:
> On 5/23/23 16:33, Richard Henderson wrote:
>>
>>
>> The tests are poorly ordered, testing many unlikely things before the most likely thing
>> (normal). A better ordering would be
>>
>> if (likely(tp##_is_normal(arg))) {
>> } else if (tp##_is_zero(arg)) {
>> } else if (tp##_is_zero_or_denormal(arg)) {
>> } else if (tp##_is_infinity(arg)) {
>> } else {
>> // nan case
>> }
>>
>> Secondly, we compute the classify bitmask, and then deconstruct the mask again in
>> set_fprf_from_class. Since we don't use the classify bitmask for anything else, better
>> would be to compute the fprf value directly in the if-ladder.
>
> So something like this:
>
> diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
> index a66e16c2128c..daed97ca178e 100644
> --- a/target/ppc/fpu_helper.c
> +++ b/target/ppc/fpu_helper.c
> @@ -141,62 +141,30 @@ static inline int ppc_float64_get_unbiased_exp(float64 f)
> return ((f >> 52) & 0x7FF) - 1023;
> }
>
> -/* Classify a floating-point number. */
> -enum {
> - is_normal = 1,
> - is_zero = 2,
> - is_denormal = 4,
> - is_inf = 8,
> - is_qnan = 16,
> - is_snan = 32,
> - is_neg = 64,
> -};
> -
> -#define COMPUTE_CLASS(tp) \
> -static int tp##_classify(tp arg) \
> -{ \
> - int ret = tp##_is_neg(arg) * is_neg; \
> - if (unlikely(tp##_is_any_nan(arg))) { \
> - float_status dummy = { }; /* snan_bit_is_one = 0 */ \
> - ret |= (tp##_is_signaling_nan(arg, &dummy) \
> - ? is_snan : is_qnan); \
> - } else if (unlikely(tp##_is_infinity(arg))) { \
> - ret |= is_inf; \
> - } else if (tp##_is_zero(arg)) { \
> - ret |= is_zero; \
> - } else if (tp##_is_zero_or_denormal(arg)) { \
> - ret |= is_denormal; \
> - } else { \
> - ret |= is_normal; \
> - } \
> - return ret; \
> -}
> -
> -COMPUTE_CLASS(float16)
> -COMPUTE_CLASS(float32)
> -COMPUTE_CLASS(float64)
> -COMPUTE_CLASS(float128)
> -
> -static void set_fprf_from_class(CPUPPCState *env, int class)
> +static void set_fprf(CPUPPCState *env, uint8_t ret)
> {
> - static const uint8_t fprf[6][2] = {
> - { 0x04, 0x08 }, /* normalized */
> - { 0x02, 0x12 }, /* zero */
> - { 0x14, 0x18 }, /* denormalized */
> - { 0x05, 0x09 }, /* infinity */
> - { 0x11, 0x11 }, /* qnan */
> - { 0x00, 0x00 }, /* snan -- flags are undefined */
> - };
> - bool isneg = class & is_neg;
> -
> env->fpscr &= ~FP_FPRF;
> - env->fpscr |= fprf[ctz32(class)][isneg] << FPSCR_FPRF;
> + env->fpscr |= ret << FPSCR_FPRF;
> }
>
> -#define COMPUTE_FPRF(tp) \
> -void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
> -{ \
> - set_fprf_from_class(env, tp##_classify(arg)); \
> +#define COMPUTE_FPRF(tp) \
> +void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
> +{ \
> + int ret; \
> + if (tp##_is_normal(arg)) { \
> + ret = 0x0408; \
> + } else if (tp##_is_zero(arg)) { \
> + ret = 0x0212; \
> + } else if (tp##_is_zero_or_denormal(arg)) { \
> + ret = 0x1418; \
> + } else if (unlikely(tp##_is_infinity(arg))) { \
> + ret = 0x0509; \
> + } else { \
> + float_status dummy = { }; /* snan_bit_is_one = 0 */ \
> + ret = (tp##_is_signaling_nan(arg, &dummy) \
> + ? 0x0000 : 0x1111); \
> + } \
> + set_fprf(env, tp##_is_neg(arg) ? (uint8_t)ret : ret >> 8); \
> }
>
> COMPUTE_FPRF(float16)
>
>
> Not tested beyond compilation, but if Zoltan reports that it helps
> I can write a commit message and submit it.
See
https://patchew.org/QEMU/20230523202507.688859-1-richard.henderson@linaro.org/
and follow-up. It drops this one function from the profile, but only helps
very slightly in wall-clock time.
r~
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-25 13:22 ` Paolo Bonzini
2023-05-25 13:30 ` Richard Henderson
@ 2023-05-25 23:15 ` BALATON Zoltan
1 sibling, 0 replies; 14+ messages in thread
From: BALATON Zoltan @ 2023-05-25 23:15 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Richard Henderson, Alex Bennée, qemu-devel, Aurelien Jarno,
Peter Maydell
On Thu, 25 May 2023, Paolo Bonzini wrote:
> On 5/23/23 16:33, Richard Henderson wrote:
>> The tests are poorly ordered, testing many unlikely things before the most
>> likely thing (normal). A better ordering would be
>>
>> if (likely(tp##_is_normal(arg))) {
>> } else if (tp##_is_zero(arg)) {
>> } else if (tp##_is_zero_or_denormal(arg)) {
>> } else if (tp##_is_infinity(arg)) {
>> } else {
>> // nan case
>> }
>>
>> Secondly, we compute the classify bitmask, and then deconstruct the mask
>> again in set_fprf_from_class. Since we don't use the classify bitmask for
>> anything else, better would be to compute the fprf value directly in the
>> if-ladder.
>
> So something like this:
>
> diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
I've tested this too (the patch did not apply for some reason, so I had to
apply it by hand), but Richard's patch seems to get better results. This one
also helps, but takes a few seconds more than Richard's patch, with which
this function drops 1% lower in the profile.
Regards,
BALATON Zoltan
> index a66e16c2128c..daed97ca178e 100644
> --- a/target/ppc/fpu_helper.c
> +++ b/target/ppc/fpu_helper.c
> @@ -141,62 +141,30 @@ static inline int ppc_float64_get_unbiased_exp(float64
> f)
> return ((f >> 52) & 0x7FF) - 1023;
> }
> -/* Classify a floating-point number. */
> -enum {
> - is_normal = 1,
> - is_zero = 2,
> - is_denormal = 4,
> - is_inf = 8,
> - is_qnan = 16,
> - is_snan = 32,
> - is_neg = 64,
> -};
> -
> -#define COMPUTE_CLASS(tp) \
> -static int tp##_classify(tp arg) \
> -{ \
> - int ret = tp##_is_neg(arg) * is_neg; \
> - if (unlikely(tp##_is_any_nan(arg))) { \
> - float_status dummy = { }; /* snan_bit_is_one = 0 */ \
> - ret |= (tp##_is_signaling_nan(arg, &dummy) \
> - ? is_snan : is_qnan); \
> - } else if (unlikely(tp##_is_infinity(arg))) { \
> - ret |= is_inf; \
> - } else if (tp##_is_zero(arg)) { \
> - ret |= is_zero; \
> - } else if (tp##_is_zero_or_denormal(arg)) { \
> - ret |= is_denormal; \
> - } else { \
> - ret |= is_normal; \
> - } \
> - return ret; \
> -}
> -
> -COMPUTE_CLASS(float16)
> -COMPUTE_CLASS(float32)
> -COMPUTE_CLASS(float64)
> -COMPUTE_CLASS(float128)
> -
> -static void set_fprf_from_class(CPUPPCState *env, int class)
> +static void set_fprf(CPUPPCState *env, uint8_t ret)
> {
> - static const uint8_t fprf[6][2] = {
> - { 0x04, 0x08 }, /* normalized */
> - { 0x02, 0x12 }, /* zero */
> - { 0x14, 0x18 }, /* denormalized */
> - { 0x05, 0x09 }, /* infinity */
> - { 0x11, 0x11 }, /* qnan */
> - { 0x00, 0x00 }, /* snan -- flags are undefined */
> - };
> - bool isneg = class & is_neg;
> -
> env->fpscr &= ~FP_FPRF;
> - env->fpscr |= fprf[ctz32(class)][isneg] << FPSCR_FPRF;
> + env->fpscr |= ret << FPSCR_FPRF;
> }
> -#define COMPUTE_FPRF(tp) \
> -void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
> -{ \
> - set_fprf_from_class(env, tp##_classify(arg)); \
> +#define COMPUTE_FPRF(tp) \
> +void helper_compute_fprf_##tp(CPUPPCState *env, tp arg) \
> +{ \
> + int ret; \
> + if (tp##_is_normal(arg)) { \
> + ret = 0x0408; \
> + } else if (tp##_is_zero(arg)) { \
> + ret = 0x0212; \
> + } else if (tp##_is_zero_or_denormal(arg)) { \
> + ret = 0x1418; \
> + } else if (unlikely(tp##_is_infinity(arg))) { \
> + ret = 0x0509; \
> + } else { \
> + float_status dummy = { }; /* snan_bit_is_one = 0 */ \
> + ret = (tp##_is_signaling_nan(arg, &dummy) \
> + ? 0x0000 : 0x1111); \
> + } \
> + set_fprf(env, tp##_is_neg(arg) ? (uint8_t)ret : ret >> 8); \
> }
> COMPUTE_FPRF(float16)
>
>
> Not tested beyond compilation, but if Zoltan reports that it helps
> I can write a commit message and submit it.
>
> Paolo
>
>
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-25 13:30 ` Paolo Bonzini
@ 2023-05-25 23:19 ` BALATON Zoltan
0 siblings, 0 replies; 14+ messages in thread
From: BALATON Zoltan @ 2023-05-25 23:19 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Richard Henderson, Alex Bennée, qemu-devel, Aurelien Jarno,
Peter Maydell
On Thu, 25 May 2023, Paolo Bonzini wrote:
> On 5/23/23 16:33, Richard Henderson wrote:
>>
>> The tests are poorly ordered, testing many unlikely things before the most
>> likely thing (normal). A better ordering would be
>>
>> if (likely(tp##_is_normal(arg))) {
>> } else if (tp##_is_zero(arg)) {
>> } else if (tp##_is_zero_or_denormal(arg)) {
>> } else if (tp##_is_infinity(arg)) {
>> } else {
>> // nan case
>> }
>
> Might also benefit from a is_finite (true if zero or normal or denormal)
> predicate, to do
>
> if (tp##_is_finite(arg)) {
There seems to be only is_infinity, and I'm not sure is_finite would be the
same as !is_infinity, so I could not try this. But it seems having any
branches kills performance, so adding more branches may not help (also,
since infinite values are likely infrequent, I'm not sure why this would
be better).
Regards,
BALATON Zoltan
> if (!tp##_is_zero_or_denormal(arg)) {
> // normal
> } else if (tp##_is_zero(arg)) {
> } else {
> // denormal
> }
> } else if (tp##_is_infinity(arg)) {
> } else {
> // nan
> }
>
> since is_normal is a bit more complex and inefficient than the others. The
> compiler should easily reuse the result of masking away the sign bit.
>
> Paolo
>
>
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:57 ` BALATON Zoltan
2023-05-23 14:33 ` Richard Henderson
@ 2023-05-26 11:56 ` BALATON Zoltan
2023-06-22 20:55 ` BALATON Zoltan
2 siblings, 0 replies; 14+ messages in thread
From: BALATON Zoltan @ 2023-05-26 11:56 UTC (permalink / raw)
To: Alex Bennée
Cc: qemu-devel, Richard Henderson, Aurelien Jarno, Peter Maydell
On Tue, 23 May 2023, BALATON Zoltan wrote:
> Unrelated to this patch I also started to see random crashes with a DSI on a
> dcbz instruction now which did not happen before (or not frequently enough
> for me to notice). I did not bisect that as it happens randomly but I wonder
> if it could be related to recent unaligned access changes or some other TCG
> change? Any idea what to check?
I've tried to bisect this, but I could also reproduce it with v8.0.0.
It seems to depend on actions within the guest and happens only if I start
something too early in the boot, so maybe it's a guest bug. If I wait for
it to fully boot before starting a shell, I don't get a crash. So maybe
this is not QEMU-related, or if it is, it predates 8.0.
Regards,
BALATON Zoltan
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-05-23 13:57 ` BALATON Zoltan
2023-05-23 14:33 ` Richard Henderson
2023-05-26 11:56 ` BALATON Zoltan
@ 2023-06-22 20:55 ` BALATON Zoltan
2023-06-23 5:50 ` Richard Henderson
2 siblings, 1 reply; 14+ messages in thread
From: BALATON Zoltan @ 2023-06-22 20:55 UTC (permalink / raw)
To: Alex Bennée
Cc: qemu-devel, Richard Henderson, Aurelien Jarno, Peter Maydell
Hello,
What happened to this patch? Will this be merged by somebody?
Regards,
BALATON Zoltan
On Tue, 23 May 2023, BALATON Zoltan wrote:
> On Tue, 23 May 2023, Alex Bennée wrote:
>> Balton discovered that asserts for the extract/deposit calls had a
>
> My name is missing an a, and my given name is Zoltan. (First and last
> names are the other way around in Hungarian.) Maybe just add a Reported-by
> tag instead if you want to record it.
>
>> significant impact on a lame benchmark on qemu-ppc. Replicating with:
>>
>> ./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
>> -h pts-trondheim-3.wav pts-trondheim-3.mp3
>>
>> showed up the pack/unpack routines not eliding the assert checks as it
>> should have done causing them to prominently figure in the profile:
>>
>> 11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
>> 11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
>> 8.26% qemu-ppc64 qemu-ppc64 [.]
>> helper_compute_fprf_float64
>> 6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
>> 5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
>> 4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
>> 4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
>> 3.62% qemu-ppc64 qemu-ppc64 [.]
>> float64r32_round_pack_canonical
>>
>> After this patch the same test runs 31 seconds faster with a profile
>> where the generated code dominates more:
>>
>> + 14.12% 0.00% qemu-ppc64 [unknown] [.]
>> 0x0000004000619420
>> + 13.30% 0.00% qemu-ppc64 [unknown] [.]
>> 0x0000004000616850
>> + 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.]
>> parts64_uncanon_normal
>> + 10.62% 0.00% qemu-ppc64 [unknown] [.]
>> 0x000000400061bf70
>> + 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.]
>> helper_compute_fprf_float64
>> + 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.]
>> do_float_check_status
>> + 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.]
>> parts64_canonicalize.constprop.0
>> + 6.46% 0.00% qemu-ppc64 [unknown] [.]
>> 0x0000004000620130
>> + 6.42% 0.00% qemu-ppc64 [unknown] [.]
>> 0x0000004000619400
>> + 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.]
>> parts64_muladd
>> + 5.85% 0.00% qemu-ppc64 [unknown] [.]
>> 0x00000040006167e0
>> + 5.74% 0.00% qemu-ppc64 [unknown] [.]
>> 0x0000b693fcffffd3
>> + 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.]
>> float64r32_round_pack_canonical
>>
>> Suggested-by: Richard Henderson <richard.henderson@linaro.org>
>> Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
>> [AJB: Patchified rth's suggestion]
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: BALATON Zoltan <balaton@eik.bme.hu>
>
> Replace Cc: with
> Tested-by: BALATON Zoltan <balaton@eik.bme.hu>
>
> This solves the softfloat-related usages; the rest probably have lower
> overhead, as I could not measure any further improvement from removing
> asserts on top of this patch. I still have these functions high in my
> profiling result:
>
> children self command symbol
> 11.40% 10.86% qemu-system-ppc helper_compute_fprf_float64
> 11.25% 0.61% qemu-system-ppc helper_fmadds
> 10.01% 3.23% qemu-system-ppc float64r32_round_pack_canonical
> 8.59% 1.80% qemu-system-ppc helper_float_check_status
> 8.34% 7.23% qemu-system-ppc parts64_muladd
> 8.16% 0.67% qemu-system-ppc helper_fmuls
> 8.08% 0.43% qemu-system-ppc parts64_uncanon
> 7.49% 1.78% qemu-system-ppc float64r32_mul
> 7.32% 7.32% qemu-system-ppc parts64_uncanon_normal
> 6.48% 0.52% qemu-system-ppc helper_fadds
> 6.31% 6.31% qemu-system-ppc do_float_check_status
> 5.99% 1.14% qemu-system-ppc float64r32_add
>
> Any idea on those?
>
> Unrelated to this patch I also started to see random crashes with a DSI on a
> dcbz instruction now which did not happen before (or not frequently enough
> for me to notice). I did not bisect that as it happens randomly but I wonder
> if it could be related to recent unaligned access changes or some other TCG
> change? Any idea what to check?
>
> Regards,
> BALATON Zoltan
>
>> ---
>> fpu/softfloat.c | 22 +++++++++++-----------
>> 1 file changed, 11 insertions(+), 11 deletions(-)
>>
>> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
>> index 108f9cb224..42e6c188b4 100644
>> --- a/fpu/softfloat.c
>> +++ b/fpu/softfloat.c
>> @@ -593,27 +593,27 @@ static void unpack_raw64(FloatParts64 *r, const
>> FloatFmt *fmt, uint64_t raw)
>> };
>> }
>>
>> -static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
>> +static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
>> {
>> unpack_raw64(p, &float16_params, f);
>> }
>>
>> -static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
>> +static void QEMU_FLATTEN bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
>> {
>> unpack_raw64(p, &bfloat16_params, f);
>> }
>>
>> -static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
>> +static void QEMU_FLATTEN float32_unpack_raw(FloatParts64 *p, float32 f)
>> {
>> unpack_raw64(p, &float32_params, f);
>> }
>>
>> -static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
>> +static void QEMU_FLATTEN float64_unpack_raw(FloatParts64 *p, float64 f)
>> {
>> unpack_raw64(p, &float64_params, f);
>> }
>>
>> -static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
>> +static void QEMU_FLATTEN floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
>> {
>> *p = (FloatParts128) {
>> .cls = float_class_unclassified,
>> @@ -623,7 +623,7 @@ static void floatx80_unpack_raw(FloatParts128 *p,
>> floatx80 f)
>> };
>> }
>>
>> -static void float128_unpack_raw(FloatParts128 *p, float128 f)
>> +static void QEMU_FLATTEN float128_unpack_raw(FloatParts128 *p, float128 f)
>> {
>> const int f_size = float128_params.frac_size - 64;
>> const int e_size = float128_params.exp_size;
>> @@ -650,27 +650,27 @@ static uint64_t pack_raw64(const FloatParts64 *p,
>> const FloatFmt *fmt)
>> return ret;
>> }
>>
>> -static inline float16 float16_pack_raw(const FloatParts64 *p)
>> +static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
>> {
>> return make_float16(pack_raw64(p, &float16_params));
>> }
>>
>> -static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
>> +static bfloat16 QEMU_FLATTEN bfloat16_pack_raw(const FloatParts64 *p)
>> {
>> return pack_raw64(p, &bfloat16_params);
>> }
>>
>> -static inline float32 float32_pack_raw(const FloatParts64 *p)
>> +static float32 QEMU_FLATTEN float32_pack_raw(const FloatParts64 *p)
>> {
>> return make_float32(pack_raw64(p, &float32_params));
>> }
>>
>> -static inline float64 float64_pack_raw(const FloatParts64 *p)
>> +static float64 QEMU_FLATTEN float64_pack_raw(const FloatParts64 *p)
>> {
>> return make_float64(pack_raw64(p, &float64_params));
>> }
>>
>> -static float128 float128_pack_raw(const FloatParts128 *p)
>> +static float128 QEMU_FLATTEN float128_pack_raw(const FloatParts128 *p)
>> {
>> const int f_size = float128_params.frac_size - 64;
>> const int e_size = float128_params.exp_size;
>
* Re: [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining
2023-06-22 20:55 ` BALATON Zoltan
@ 2023-06-23 5:50 ` Richard Henderson
0 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-06-23 5:50 UTC (permalink / raw)
To: BALATON Zoltan, Alex Bennée
Cc: qemu-devel, Aurelien Jarno, Peter Maydell
On 6/22/23 22:55, BALATON Zoltan wrote:
> Hello,
>
> What happened to this patch? Will this be merged by somebody?
Thanks for the reminder. Queued to tcg-next.
r~
>
> Regards,
> BALATON Zoltan
>
> On Tue, 23 May 2023, BALATON Zoltan wrote:
>> On Tue, 23 May 2023, Alex Bennée wrote:
>>> Balton discovered that asserts for the extract/deposit calls had a
>>
>> My name is missing an a, and my given name is Zoltan. (First and last
>> names are the other way around in Hungarian.) Maybe just add a
>> Reported-by tag instead if you want to record it.
>>
>>> significant impact on a lame benchmark on qemu-ppc. Replicating with:
>>>
>>> ./qemu-ppc64 ~/lsrc/tests/lame.git-svn/builds/ppc64/frontend/lame \
>>> -h pts-trondheim-3.wav pts-trondheim-3.mp3
>>>
>>> showed up the pack/unpack routines not eliding the assert checks as it
>>> should have done causing them to prominently figure in the profile:
>>>
>>> 11.44% qemu-ppc64 qemu-ppc64 [.] unpack_raw64.isra.0
>>> 11.03% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
>>> 8.26% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
>>> 6.75% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
>>> 5.34% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
>>> 4.75% qemu-ppc64 qemu-ppc64 [.] pack_raw64.isra.0
>>> 4.38% qemu-ppc64 qemu-ppc64 [.] parts64_canonicalize
>>> 3.62% qemu-ppc64 qemu-ppc64 [.] float64r32_round_pack_canonical
>>>
>>> After this patch the same test runs 31 seconds faster with a profile
>>> where the generated code dominates more:
>>>
>>> + 14.12% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619420
>>> + 13.30% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000616850
>>> + 12.58% 12.19% qemu-ppc64 qemu-ppc64 [.] parts64_uncanon_normal
>>> + 10.62% 0.00% qemu-ppc64 [unknown] [.] 0x000000400061bf70
>>> + 9.91% 9.73% qemu-ppc64 qemu-ppc64 [.] helper_compute_fprf_float64
>>> + 7.84% 7.82% qemu-ppc64 qemu-ppc64 [.] do_float_check_status
>>> + 6.47% 5.78% qemu-ppc64 qemu-ppc64 [.]
>>> parts64_canonicalize.constprop.0
>>> + 6.46% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000620130
>>> + 6.42% 0.00% qemu-ppc64 [unknown] [.] 0x0000004000619400
>>> + 6.17% 6.04% qemu-ppc64 qemu-ppc64 [.] parts64_muladd
>>> + 5.85% 0.00% qemu-ppc64 [unknown] [.] 0x00000040006167e0
>>> + 5.74% 0.00% qemu-ppc64 [unknown] [.] 0x0000b693fcffffd3
>>> + 5.45% 4.78% qemu-ppc64 qemu-ppc64 [.]
>>> float64r32_round_pack_canonical
>>>
>>> Suggested-by: Richard Henderson <richard.henderson@linaro.org>
>>> Message-Id: <ec9cfe5a-d5f2-466d-34dc-c35817e7e010@linaro.org>
>>> [AJB: Patchified rth's suggestion]
>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>> Cc: BALATON Zoltan <balaton@eik.bme.hu>
>>
>> Replace Cc: with
>> Tested-by: BALATON Zoltan <balaton@eik.bme.hu>
>>
>> This solves the softfloat-related usages; the rest probably have lower
>> overhead, as I could not measure any further improvement from removing
>> asserts on top of this patch. I still have these functions high in my
>> profiling result:
>>
>> children self command symbol
>> 11.40% 10.86% qemu-system-ppc helper_compute_fprf_float64
>> 11.25% 0.61% qemu-system-ppc helper_fmadds
>> 10.01% 3.23% qemu-system-ppc float64r32_round_pack_canonical
>> 8.59% 1.80% qemu-system-ppc helper_float_check_status
>> 8.34% 7.23% qemu-system-ppc parts64_muladd
>> 8.16% 0.67% qemu-system-ppc helper_fmuls
>> 8.08% 0.43% qemu-system-ppc parts64_uncanon
>> 7.49% 1.78% qemu-system-ppc float64r32_mul
>> 7.32% 7.32% qemu-system-ppc parts64_uncanon_normal
>> 6.48% 0.52% qemu-system-ppc helper_fadds
>> 6.31% 6.31% qemu-system-ppc do_float_check_status
>> 5.99% 1.14% qemu-system-ppc float64r32_add
>>
>> Any idea on those?
>>
>> Unrelated to this patch I also started to see random crashes with a DSI on a dcbz
>> instruction now which did not happen before (or not frequently enough for me to notice).
>> I did not bisect that as it happens randomly but I wonder if it could be related to
>> recent unaligned access changes or some other TCG change? Any idea what to check?
>>
>> Regards,
>> BALATON Zoltan
>>
>>> ---
>>> fpu/softfloat.c | 22 +++++++++++-----------
>>> 1 file changed, 11 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
>>> index 108f9cb224..42e6c188b4 100644
>>> --- a/fpu/softfloat.c
>>> +++ b/fpu/softfloat.c
>>> @@ -593,27 +593,27 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt,
>>> uint64_t raw)
>>> };
>>> }
>>>
>>> -static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
>>> +static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
>>> {
>>> unpack_raw64(p, &float16_params, f);
>>> }
>>>
>>> -static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
>>> +static void QEMU_FLATTEN bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
>>> {
>>> unpack_raw64(p, &bfloat16_params, f);
>>> }
>>>
>>> -static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
>>> +static void QEMU_FLATTEN float32_unpack_raw(FloatParts64 *p, float32 f)
>>> {
>>> unpack_raw64(p, &float32_params, f);
>>> }
>>>
>>> -static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
>>> +static void QEMU_FLATTEN float64_unpack_raw(FloatParts64 *p, float64 f)
>>> {
>>> unpack_raw64(p, &float64_params, f);
>>> }
>>>
>>> -static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
>>> +static void QEMU_FLATTEN floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
>>> {
>>> *p = (FloatParts128) {
>>> .cls = float_class_unclassified,
>>> @@ -623,7 +623,7 @@ static void floatx80_unpack_raw(FloatParts128 *p, floatx80 f)
>>> };
>>> }
>>>
>>> -static void float128_unpack_raw(FloatParts128 *p, float128 f)
>>> +static void QEMU_FLATTEN float128_unpack_raw(FloatParts128 *p, float128 f)
>>> {
>>> const int f_size = float128_params.frac_size - 64;
>>> const int e_size = float128_params.exp_size;
>>> @@ -650,27 +650,27 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt
>>> *fmt)
>>> return ret;
>>> }
>>>
>>> -static inline float16 float16_pack_raw(const FloatParts64 *p)
>>> +static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
>>> {
>>> return make_float16(pack_raw64(p, &float16_params));
>>> }
>>>
>>> -static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
>>> +static bfloat16 QEMU_FLATTEN bfloat16_pack_raw(const FloatParts64 *p)
>>> {
>>> return pack_raw64(p, &bfloat16_params);
>>> }
>>>
>>> -static inline float32 float32_pack_raw(const FloatParts64 *p)
>>> +static float32 QEMU_FLATTEN float32_pack_raw(const FloatParts64 *p)
>>> {
>>> return make_float32(pack_raw64(p, &float32_params));
>>> }
>>>
>>> -static inline float64 float64_pack_raw(const FloatParts64 *p)
>>> +static float64 QEMU_FLATTEN float64_pack_raw(const FloatParts64 *p)
>>> {
>>> return make_float64(pack_raw64(p, &float64_params));
>>> }
>>>
>>> -static float128 float128_pack_raw(const FloatParts128 *p)
>>> +static float128 QEMU_FLATTEN float128_pack_raw(const FloatParts128 *p)
>>> {
>>> const int f_size = float128_params.frac_size - 64;
>>> const int e_size = float128_params.exp_size;
>>
end of thread, other threads:[~2023-06-23 5:51 UTC | newest]
Thread overview: 14+ messages
-- links below jump to the message on this page --
2023-05-23 13:11 [RFC PATCH] softfloat: use QEMU_FLATTEN to avoid mistaken isra inlining Alex Bennée
2023-05-23 13:57 ` BALATON Zoltan
2023-05-23 14:33 ` Richard Henderson
2023-05-23 17:51 ` BALATON Zoltan
2023-05-25 13:22 ` Paolo Bonzini
2023-05-25 13:30 ` Richard Henderson
2023-05-25 23:15 ` BALATON Zoltan
2023-05-25 13:30 ` Paolo Bonzini
2023-05-25 23:19 ` BALATON Zoltan
2023-05-26 11:56 ` BALATON Zoltan
2023-06-22 20:55 ` BALATON Zoltan
2023-06-23 5:50 ` Richard Henderson
2023-05-23 14:18 ` Philippe Mathieu-Daudé
2023-05-23 15:34 ` Richard Henderson