Re: [patch 5/6] [RFD] timekeeping: Provide optional 128bit math

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Thomas Gleixner <tglx@linutronix.de>
Cc: LKML <linux-kernel@vger.kernel.org>,
	John Stultz <john.stultz@linaro.org>,
	Ingo Molnar <mingo@kernel.org>,
	David Gibson <david@gibson.dropbear.id.au>,
	Liav Rehana <liavr@mellanox.com>,
	Chris Metcalf <cmetcalf@mellanox.com>,
	Richard Cochran <richardcochran@gmail.com>,
	Parit Bhargava <prarit@redhat.com>,
	Laurent Vivier <lvivier@redhat.com>,
	"Christopher S. Hall" <christopher.s.hall@intel.com>
Subject: Re: [patch 5/6] [RFD] timekeeping: Provide optional 128bit math
Date: Fri, 9 Dec 2016 09:30:11 +0100	[thread overview]
Message-ID: <20161209083011.GD15765@worktop.programming.kicks-ass.net> (raw)
In-Reply-To: <20161209063847.GC15765@worktop.programming.kicks-ass.net>

On Fri, Dec 09, 2016 at 07:38:47AM +0100, Peter Zijlstra wrote:
> On Fri, Dec 09, 2016 at 06:26:38AM +0100, Peter Zijlstra wrote:

> > Just for giggles, on tilegx the branch is actually slower than doing the
> > mult unconditionally.
> > 
> > The problem is that the two multiplies would otherwise completely
> > pipeline, whereas with the conditional you serialize them.
> 
> On my Haswell laptop the unconditional version is faster too.

Only when using x86_64 instructions, once I fixed the i386 variant it
was slower, probably due to register pressure and the like.

> > (came to light while talking about why the mul_u64_u32_shr() fallback
> > didn't work right for them, which was a combination of the above issue
> > and the fact that their compiler 'lost' the fact that these are
> > 32x32->64 mults and did 64x64 ones instead).
> 
> Turns out using GCC-6.2.1 we have the same problem on i386, GCC doesn't
> recognise the 32x32 mults and generates crap.
> 
> This used to work :/

Do we want something like so?

---
 arch/tile/include/asm/Kbuild  |  1 -
 arch/tile/include/asm/div64.h | 14 ++++++++++++++
 arch/x86/include/asm/div64.h  | 10 ++++++++++
 include/linux/math64.h        | 26 ++++++++++++++++++--------
 4 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 2d1f5638974c..20f2ba6d79be 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -5,7 +5,6 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h
index e69de29bb2d1..bf6161966dfa 100644
--- a/arch/tile/include/asm/div64.h
+++ b/arch/tile/include/asm/div64.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_TILE_DIV64_H
+#define _ASM_TILE_DIV64_H
+
+#ifdef __tilegx__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return __insn_mul_lu_lu(a, b);
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+
+#include <asm-generic/div64.h>
+
+#endif /* _ASM_TILE_DIV64_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index ced283ac79df..68f4ae5e8976 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -59,6 +59,16 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 }
 #define div_u64_rem	div_u64_rem
 
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	u64 ret;
+
+	asm ("mull %[b]" : "=A" (ret) : [a] "a" (a), [b] "g" (b) );
+
+	return ret;
+}
+#define mul_u32_u32 mul_u32_u32
+
 #else
 # include <asm-generic/div64.h>
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6e8b5b270ffe..80690c96c734 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,6 +133,16 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
 	return ret;
 }
 
+#ifndef mul_u32_u32
+/*
+ * Many a GCC version messes this up and generates a 64x64 mult :-(
+ */
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#endif
+
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
 #ifndef mul_u64_u32_shr
@@ -160,9 +170,9 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 	al = a;
 	ah = a >> 32;
 
-	ret = ((u64)al * mul) >> shift;
+	ret = mul_u32_u32(al, mul) >> shift;
 	if (ah)
-		ret += ((u64)ah * mul) << (32 - shift);
+		ret += mul_u32_u32(ah, mul) << (32 - shift);
 
 	return ret;
 }
@@ -186,10 +196,10 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
 	a0.ll = a;
 	b0.ll = b;
 
-	rl.ll = (u64)a0.l.low * b0.l.low;
-	rm.ll = (u64)a0.l.low * b0.l.high;
-	rn.ll = (u64)a0.l.high * b0.l.low;
-	rh.ll = (u64)a0.l.high * b0.l.high;
+	rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
+	rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
+	rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
+	rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
 
 	/*
 	 * Each of these lines computes a 64-bit intermediate result into "c",
@@ -229,8 +239,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 	} u, rl, rh;
 
 	u.ll = a;
-	rl.ll = (u64)u.l.low * mul;
-	rh.ll = (u64)u.l.high * mul + rl.l.high;
+	rl.ll = mul_u32_u32(u.l.low, mul);
+	rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high;
 
 	/* Bits 32-63 of the result will be in rh.l.low. */
 	rl.l.high = do_div(rh.ll, divisor);

next prev parent reply	other threads:[~2016-12-09  8:30 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-08 20:49 [patch 0/6] timekeeping: Cure the signed/unsigned wreckage Thomas Gleixner
2016-12-08 20:49 ` [patch 1/6] timekeeping: Force unsigned clocksource to nanoseconds conversion Thomas Gleixner
2016-12-08 23:38   ` David Gibson
2016-12-09 11:13   ` [tip:timers/core] timekeeping_Force_unsigned_clocksource_to_nanoseconds_conversion tip-bot for Thomas Gleixner
2016-12-08 20:49 ` [patch 2/6] timekeeping: Make the conversion call chain consistently unsigned Thomas Gleixner
2016-12-08 23:39   ` David Gibson
2016-12-09 11:13   ` [tip:timers/core] " tip-bot for Thomas Gleixner
2016-12-08 20:49 ` [patch 3/6] timekeeping: Get rid of pointless typecasts Thomas Gleixner
2016-12-08 23:40   ` David Gibson
2016-12-09 11:14   ` [tip:timers/core] " tip-bot for Thomas Gleixner
2016-12-08 20:49 ` [patch 4/6] timekeeping: Use mul_u64_u32_shr() instead of open coding it Thomas Gleixner
2016-12-08 23:41   ` David Gibson
2016-12-09 11:14   ` [tip:timers/core] " tip-bot for Thomas Gleixner
2016-12-08 20:49 ` [patch 5/6] [RFD] timekeeping: Provide optional 128bit math Thomas Gleixner
2016-12-09  4:08   ` Ingo Molnar
2016-12-09  4:29     ` Ingo Molnar
2016-12-09  4:39       ` John Stultz
2016-12-09  4:48     ` Peter Zijlstra
2016-12-09  5:22       ` Ingo Molnar
2016-12-09  5:41         ` Peter Zijlstra
2016-12-09  5:11   ` Peter Zijlstra
2016-12-09  6:08     ` Peter Zijlstra
2016-12-09  5:26   ` Peter Zijlstra
2016-12-09  6:38     ` Peter Zijlstra
2016-12-09  8:30       ` Peter Zijlstra [this message]
2016-12-09  9:11         ` Peter Zijlstra
2016-12-09 10:01         ` Peter Zijlstra
2016-12-09 17:32         ` Chris Metcalf
2017-01-14 12:51         ` [tip:timers/core] math64, timers: Fix 32bit mul_u64_u32_shr() and friends tip-bot for Peter Zijlstra
2016-12-09 10:18       ` [patch 5/6] [RFD] timekeeping: Provide optional 128bit math Peter Zijlstra
2016-12-09 17:20         ` Chris Metcalf
2016-12-08 20:49 ` [patch 6/6] [RFD] timekeeping: Get rid of cycle_t Thomas Gleixner
2016-12-08 23:43   ` David Gibson
2016-12-09  4:52 ` [patch 0/6] timekeeping: Cure the signed/unsigned wreckage John Stultz
2016-12-09  5:30 ` Peter Zijlstra

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2d1f5638974 dfblob:20f2ba6d79b dfblob:e69de29bb2d
dfblob:bf6161966df dfblob:ced283ac79d dfblob:68f4ae5e897
dfblob:6e8b5b270ff dfblob:80690c96c73 )
 OR (
bs:"Re: [patch 5/6] [RFD] timekeeping: Provide optional 128bit math" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161209083011.GD15765@worktop.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=christopher.s.hall@intel.com \
    --cc=cmetcalf@mellanox.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=john.stultz@linaro.org \
    --cc=liavr@mellanox.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lvivier@redhat.com \
    --cc=mingo@kernel.org \
    --cc=prarit@redhat.com \
    --cc=richardcochran@gmail.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.