* [PATCH v5 next 1/9] lib: mul_u64_u64_div_u64() rename parameter 'c' to 'd'
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:10 ` [PATCH v5 next 2/9] lib: mul_u64_u64_div_u64() Combine overflow and divide by zero checks David Laight
` (7 subsequent siblings)
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
Change to prototype from mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
to mul_u64_u64_div_u64(u64 a, u64 b, u64 d).
Using 'd' for 'divisor' makes more sense.
An upcoming change adds a 'c' parameter to calculate (a * b + c)/d.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
lib/math/div64.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index bf77b9843175..0ebff850fd4d 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -184,10 +184,10 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
EXPORT_SYMBOL(iter_div_u64_rem);
#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
{
if (ilog2(a) + ilog2(b) <= 62)
- return div64_u64(a * b, c);
+ return div64_u64(a * b, d);
#if defined(__SIZEOF_INT128__)
@@ -212,37 +212,37 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
#endif
- /* make sure c is not zero, trigger runtime exception otherwise */
- if (unlikely(c == 0)) {
+ /* make sure d is not zero, trigger runtime exception otherwise */
+ if (unlikely(d == 0)) {
unsigned long zero = 0;
OPTIMIZER_HIDE_VAR(zero);
return ~0UL/zero;
}
- int shift = __builtin_ctzll(c);
+ int shift = __builtin_ctzll(d);
/* try reducing the fraction in case the dividend becomes <= 64 bits */
if ((n_hi >> shift) == 0) {
u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
- return div64_u64(n, c >> shift);
+ return div64_u64(n, d >> shift);
/*
* The remainder value if needed would be:
- * res = div64_u64_rem(n, c >> shift, &rem);
+ * res = div64_u64_rem(n, d >> shift, &rem);
* rem = (rem << shift) + (n_lo - (n << shift));
*/
}
- if (n_hi >= c) {
+ if (n_hi >= d) {
/* overflow: result is unrepresentable in a u64 */
return -1;
}
/* Do the full 128 by 64 bits division */
- shift = __builtin_clzll(c);
- c <<= shift;
+ shift = __builtin_clzll(d);
+ d <<= shift;
int p = 64 + shift;
u64 res = 0;
@@ -257,8 +257,8 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
n_hi <<= shift;
n_hi |= n_lo >> (64 - shift);
n_lo <<= shift;
- if (carry || (n_hi >= c)) {
- n_hi -= c;
+ if (carry || (n_hi >= d)) {
+ n_hi -= d;
res |= 1ULL << p;
}
} while (n_hi);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v5 next 2/9] lib: mul_u64_u64_div_u64() Combine overflow and divide by zero checks
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
2025-11-05 20:10 ` [PATCH v5 next 1/9] lib: mul_u64_u64_div_u64() rename parameter 'c' to 'd' David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:10 ` [PATCH v5 next 3/9] lib: mul_u64_u64_div_u64() simplify check for a 64bit product David Laight
` (6 subsequent siblings)
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
Since the overflow check always triggers when the divisor is zero
move the check for divide by zero inside the overflow check.
This means there is only one test in the normal path.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
V3 contained a different patch 2 that did different changes to the
error paths.
lib/math/div64.c | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 0ebff850fd4d..1092f41e878e 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -212,12 +212,16 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
#endif
- /* make sure d is not zero, trigger runtime exception otherwise */
- if (unlikely(d == 0)) {
- unsigned long zero = 0;
+ if (unlikely(n_hi >= d)) {
+ /* trigger runtime exception if divisor is zero */
+ if (d == 0) {
+ unsigned long zero = 0;
- OPTIMIZER_HIDE_VAR(zero);
- return ~0UL/zero;
+ OPTIMIZER_HIDE_VAR(zero);
+ return ~0UL/zero;
+ }
+ /* overflow: result is unrepresentable in a u64 */
+ return ~0ULL;
}
int shift = __builtin_ctzll(d);
@@ -234,11 +238,6 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
*/
}
- if (n_hi >= d) {
- /* overflow: result is unrepresentable in a u64 */
- return -1;
- }
-
/* Do the full 128 by 64 bits division */
shift = __builtin_clzll(d);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v5 next 3/9] lib: mul_u64_u64_div_u64() simplify check for a 64bit product
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
2025-11-05 20:10 ` [PATCH v5 next 1/9] lib: mul_u64_u64_div_u64() rename parameter 'c' to 'd' David Laight
2025-11-05 20:10 ` [PATCH v5 next 2/9] lib: mul_u64_u64_div_u64() Combine overflow and divide by zero checks David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:59 ` Nicolas Pitre
2025-11-05 20:10 ` [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup() David Laight
` (5 subsequent siblings)
8 siblings, 1 reply; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
If the product is only 64bits div64_u64() can be used for the divide.
Replace the pre-multiply check (ilog2(a) + ilog2(b) <= 62) with a
simple post-multiply check that the high 64bits are zero.
This has the advantage of being simpler, more accurate and less code.
It will always be faster when the product is larger than 64bits.
Most 64bit cpus have a native 64x64=128 bit multiply, this is needed
(for the low 64bits) even when div64_u64() is called - so the early
check gains nothing and is just extra code.
32bit cpu will need a compare (etc) to generate the 64bit ilog2()
from two 32bit bit scans - so that is non-trivial.
(Never mind the mess of x86's 'bsr' and any oddball cpu without
fast bit-scan instructions.)
Whereas the additional instructions for the 128bit multiply result
are pretty much one multiply and two adds (typically the 'adc $0,%reg'
can be run in parallel with the instruction that follows).
The only outliers are 64bit systems without 128bit multiply and
simple in order 32bit ones with fast bit scan but needing extra
instructions to get the high bits of the multiply result.
I doubt it makes much difference to either, the latter is definitely
not mainstream.
If anyone is worried about the analysis they can look at the
generated code for x86 (especially when cmov isn't used).
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
Split from patch 3 for v2.
Changes for v5:
- Test for small dividends before overflow.
lib/math/div64.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 1092f41e878e..4a4b1aa9e6e1 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -186,9 +186,6 @@ EXPORT_SYMBOL(iter_div_u64_rem);
#ifndef mul_u64_u64_div_u64
u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
{
- if (ilog2(a) + ilog2(b) <= 62)
- return div64_u64(a * b, d);
-
#if defined(__SIZEOF_INT128__)
/* native 64x64=128 bits multiplication */
@@ -212,6 +209,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
#endif
+ if (!n_hi)
+ return div64_u64(n_lo, d);
+
if (unlikely(n_hi >= d)) {
/* trigger runtime exception if divisor is zero */
if (d == 0) {
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v5 next 3/9] lib: mul_u64_u64_div_u64() simplify check for a 64bit product
2025-11-05 20:10 ` [PATCH v5 next 3/9] lib: mul_u64_u64_div_u64() simplify check for a 64bit product David Laight
@ 2025-11-05 20:59 ` Nicolas Pitre
0 siblings, 0 replies; 15+ messages in thread
From: Nicolas Pitre @ 2025-11-05 20:59 UTC (permalink / raw)
To: David Laight
Cc: Andrew Morton, linux-kernel, u.kleine-koenig, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
On Wed, 5 Nov 2025, David Laight wrote:
> If the product is only 64bits div64_u64() can be used for the divide.
> Replace the pre-multiply check (ilog2(a) + ilog2(b) <= 62) with a
> simple post-multiply check that the high 64bits are zero.
>
> This has the advantage of being simpler, more accurate and less code.
> It will always be faster when the product is larger than 64bits.
>
> Most 64bit cpu have a native 64x64=128 bit multiply, this is needed
> (for the low 64bits) even when div64_u64() is called - so the early
> check gains nothing and is just extra code.
>
> 32bit cpu will need a compare (etc) to generate the 64bit ilog2()
> from two 32bit bit scans - so that is non-trivial.
> (Never mind the mess of x86's 'bsr' and any oddball cpu without
> fast bit-scan instructions.)
> Whereas the additional instructions for the 128bit multiply result
> are pretty much one multiply and two adds (typically the 'adc $0,%reg'
> can be run in parallel with the instruction that follows).
>
> The only outliers are 64bit systems without 128bit multiply and
> simple in order 32bit ones with fast bit scan but needing extra
> instructions to get the high bits of the multiply result.
> I doubt it makes much difference to either, the latter is definitely
> not mainstream.
>
> If anyone is worried about the analysis they can look at the
> generated code for x86 (especially when cmov isn't used).
>
> Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
> ---
>
> Split from patch 3 for v2.
>
> Changes for v5:
> - Test for small dividends before overflow.
>
> lib/math/div64.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/lib/math/div64.c b/lib/math/div64.c
> index 1092f41e878e..4a4b1aa9e6e1 100644
> --- a/lib/math/div64.c
> +++ b/lib/math/div64.c
> @@ -186,9 +186,6 @@ EXPORT_SYMBOL(iter_div_u64_rem);
> #ifndef mul_u64_u64_div_u64
> u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
> {
> - if (ilog2(a) + ilog2(b) <= 62)
> - return div64_u64(a * b, d);
> -
> #if defined(__SIZEOF_INT128__)
>
> /* native 64x64=128 bits multiplication */
> @@ -212,6 +209,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
>
> #endif
>
> + if (!n_hi)
> + return div64_u64(n_lo, d);
> +
> if (unlikely(n_hi >= d)) {
> /* trigger runtime exception if divisor is zero */
> if (d == 0) {
> --
> 2.39.5
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup()
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (2 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 3/9] lib: mul_u64_u64_div_u64() simplify check for a 64bit product David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-06 0:26 ` H. Peter Anvin
2025-11-05 20:10 ` [PATCH v5 next 5/9] lib: Add tests for mul_u64_u64_div_u64_roundup() David Laight
` (4 subsequent siblings)
8 siblings, 1 reply; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
variant needs 'divisor - 1' adding in between the multiply and
divide so cannot easily be done by a caller.
Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
and implement the 'round down' and 'round up' using it.
Update the x86-64 asm to optimise for 'c' being a constant zero.
Add kerndoc definitions for all three functions.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Changes for v2 (formerly patch 1/3):
- Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
Although I'm not convinced the path is common enough to be worth
the two ilog2() calls.
Changes for v3 (formerly patch 3/4):
- The early call to div64_u64() has been removed by patch 3.
Pretty much guaranteed to be a pessimisation.
Changes for v4:
- For x86-64 split the multiply, add and divide into three asm blocks.
(gcc makes a pigs breakfast of (u128)a * b + c)
- Change the kerndoc since divide by zero will (probably) fault.
Changes for v5:
- Fix test that excludes the add/adc asm block for constant zero 'add'.
arch/x86/include/asm/div64.h | 20 +++++++++------
include/linux/math64.h | 48 +++++++++++++++++++++++++++++++++++-
lib/math/div64.c | 14 ++++++-----
3 files changed, 67 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..6d8a3de3f43a 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -84,21 +84,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
* Will generate an #DE when the result doesn't fit u64, could fix with an
* __ex_table[] entry when it becomes an issue.
*/
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
{
- u64 q;
+ u64 rdx;
- asm ("mulq %2; divq %3" : "=a" (q)
- : "a" (a), "rm" (mul), "rm" (div)
- : "rdx");
+ asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
- return q;
+ if (!statically_true(!add))
+ asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
+ [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
+
+ asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
+
+ return rax;
}
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
- return mul_u64_u64_div_u64(a, mul, div);
+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6aaccc1626ab..e889d850b7f1 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -282,7 +282,53 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+/**
+ * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @c: unsigned 64bit addend
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * add a third value and then divide by a fourth.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: (@a * @b + @c) / @d
+ */
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+/**
+ * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide by a third value.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: @a * @b / @d
+ */
+#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
+
+/**
+ * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide and round up.
+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
+ * Architecture specific code may trap on zero or overflow.
+ *
+ * Return: (@a * @b + @d - 1) / @d
+ */
+#define mul_u64_u64_div_u64_roundup(a, b, d) \
+ ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
+
/**
* DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 4a4b1aa9e6e1..a88391b8ada0 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -183,13 +183,13 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
}
EXPORT_SYMBOL(iter_div_u64_rem);
-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
+#ifndef mul_u64_add_u64_div_u64
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
#if defined(__SIZEOF_INT128__)
/* native 64x64=128 bits multiplication */
- u128 prod = (u128)a * b;
+ u128 prod = (u128)a * b + c;
u64 n_lo = prod, n_hi = prod >> 64;
#else
@@ -198,8 +198,10 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
u64 x, y, z;
- x = (u64)a_lo * b_lo;
- y = (u64)a_lo * b_hi + (u32)(x >> 32);
+ /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
+ x = (u64)a_lo * b_lo + (u32)c;
+ y = (u64)a_lo * b_hi + (u32)(c >> 32);
+ y += (u32)(x >> 32);
z = (u64)a_hi * b_hi + (u32)(y >> 32);
y = (u64)a_hi * b_lo + (u32)y;
z += (u32)(y >> 32);
@@ -265,5 +267,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
return res;
}
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup()
2025-11-05 20:10 ` [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup() David Laight
@ 2025-11-06 0:26 ` H. Peter Anvin
2025-11-06 9:52 ` David Laight
0 siblings, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2025-11-06 0:26 UTC (permalink / raw)
To: David Laight, Andrew Morton, linux-kernel
Cc: u.kleine-koenig, Nicolas Pitre, Oleg Nesterov, Peter Zijlstra,
Biju Das, Borislav Petkov, Dave Hansen, Ingo Molnar,
Thomas Gleixner, Li RongQing, Khazhismel Kumykov, Jens Axboe, x86
On November 5, 2025 12:10:30 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
>variant needs 'divisor - 1' adding in between the multiply and
>divide so cannot easily be done by a caller.
>
>Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
>and implement the 'round down' and 'round up' using it.
>
>Update the x86-64 asm to optimise for 'c' being a constant zero.
>
>Add kerndoc definitions for all three functions.
>
>Signed-off-by: David Laight <david.laight.linux@gmail.com>
>Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
>---
>
>Changes for v2 (formerly patch 1/3):
>- Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
> Although I'm not convinced the path is common enough to be worth
> the two ilog2() calls.
>
>Changes for v3 (formerly patch 3/4):
>- The early call to div64_u64() has been removed by patch 3.
> Pretty much guaranteed to be a pessimisation.
>
>Changes for v4:
>- For x86-64 split the multiply, add and divide into three asm blocks.
> (gcc makes a pigs breakfast of (u128)a * b + c)
>- Change the kerndoc since divide by zero will (probably) fault.
>
>Changes for v5:
>- Fix test that excludes the add/adc asm block for constant zero 'add'.
>
> arch/x86/include/asm/div64.h | 20 +++++++++------
> include/linux/math64.h | 48 +++++++++++++++++++++++++++++++++++-
> lib/math/div64.c | 14 ++++++-----
> 3 files changed, 67 insertions(+), 15 deletions(-)
>
>diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
>index 9931e4c7d73f..6d8a3de3f43a 100644
>--- a/arch/x86/include/asm/div64.h
>+++ b/arch/x86/include/asm/div64.h
>@@ -84,21 +84,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> * Will generate an #DE when the result doesn't fit u64, could fix with an
> * __ex_table[] entry when it becomes an issue.
> */
>-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
>+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
> {
>- u64 q;
>+ u64 rdx;
>
>- asm ("mulq %2; divq %3" : "=a" (q)
>- : "a" (a), "rm" (mul), "rm" (div)
>- : "rdx");
>+ asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
>
>- return q;
>+ if (!statically_true(!add))
>+ asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
>+ [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
>+
>+ asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
>+
>+ return rax;
> }
>-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
>+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
>
> static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
> {
>- return mul_u64_u64_div_u64(a, mul, div);
>+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
> }
> #define mul_u64_u32_div mul_u64_u32_div
>
>diff --git a/include/linux/math64.h b/include/linux/math64.h
>index 6aaccc1626ab..e889d850b7f1 100644
>--- a/include/linux/math64.h
>+++ b/include/linux/math64.h
>@@ -282,7 +282,53 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
> }
> #endif /* mul_u64_u32_div */
>
>-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
>+/**
>+ * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
>+ * @a: first unsigned 64bit multiplicand
>+ * @b: second unsigned 64bit multiplicand
>+ * @c: unsigned 64bit addend
>+ * @d: unsigned 64bit divisor
>+ *
>+ * Multiply two 64bit values together to generate a 128bit product
>+ * add a third value and then divide by a fourth.
>+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
>+ * Architecture specific code may trap on zero or overflow.
>+ *
>+ * Return: (@a * @b + @c) / @d
>+ */
>+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
>+
>+/**
>+ * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
>+ * @a: first unsigned 64bit multiplicand
>+ * @b: second unsigned 64bit multiplicand
>+ * @d: unsigned 64bit divisor
>+ *
>+ * Multiply two 64bit values together to generate a 128bit product
>+ * and then divide by a third value.
>+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
>+ * Architecture specific code may trap on zero or overflow.
>+ *
>+ * Return: @a * @b / @d
>+ */
>+#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
>+
>+/**
>+ * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
>+ * @a: first unsigned 64bit multiplicand
>+ * @b: second unsigned 64bit multiplicand
>+ * @d: unsigned 64bit divisor
>+ *
>+ * Multiply two 64bit values together to generate a 128bit product
>+ * and then divide and round up.
>+ * The Generic code divides by 0 if @d is zero and returns ~0 on overflow.
>+ * Architecture specific code may trap on zero or overflow.
>+ *
>+ * Return: (@a * @b + @d - 1) / @d
>+ */
>+#define mul_u64_u64_div_u64_roundup(a, b, d) \
>+ ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
>+
>
> /**
> * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
>diff --git a/lib/math/div64.c b/lib/math/div64.c
>index 4a4b1aa9e6e1..a88391b8ada0 100644
>--- a/lib/math/div64.c
>+++ b/lib/math/div64.c
>@@ -183,13 +183,13 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
> }
> EXPORT_SYMBOL(iter_div_u64_rem);
>
>-#ifndef mul_u64_u64_div_u64
>-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
>+#ifndef mul_u64_add_u64_div_u64
>+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
> {
> #if defined(__SIZEOF_INT128__)
>
> /* native 64x64=128 bits multiplication */
>- u128 prod = (u128)a * b;
>+ u128 prod = (u128)a * b + c;
> u64 n_lo = prod, n_hi = prod >> 64;
>
> #else
>@@ -198,8 +198,10 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
> u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
> u64 x, y, z;
>
>- x = (u64)a_lo * b_lo;
>- y = (u64)a_lo * b_hi + (u32)(x >> 32);
>+ /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
>+ x = (u64)a_lo * b_lo + (u32)c;
>+ y = (u64)a_lo * b_hi + (u32)(c >> 32);
>+ y += (u32)(x >> 32);
> z = (u64)a_hi * b_hi + (u32)(y >> 32);
> y = (u64)a_hi * b_lo + (u32)y;
> z += (u32)(y >> 32);
>@@ -265,5 +267,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
>
> return res;
> }
>-EXPORT_SYMBOL(mul_u64_u64_div_u64);
>+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
> #endif
For the roundup case, I'm somewhat curious how this compares with doing:
cmp $1, %rdx
sbb $-1, %rax
... after the division. At least it means not needing to compute d - 1, saving an instruction as well as a register. Unfortunately using an lea instruction to compute %rax (which otherwise would incorporate both) isn't possible since it doesn't set the flags.
The cmp; sbb sequence should be no slower than add; adc – I'm saying "no slower" because %rdx is never written to, so I think this is provably a better sequence; whether or not it is measurable is another thing (but if we are tweaking this stuff...)
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup()
2025-11-06 0:26 ` H. Peter Anvin
@ 2025-11-06 9:52 ` David Laight
0 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-06 9:52 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andrew Morton, linux-kernel, u.kleine-koenig, Nicolas Pitre,
Oleg Nesterov, Peter Zijlstra, Biju Das, Borislav Petkov,
Dave Hansen, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
On Wed, 05 Nov 2025 16:26:05 -0800
"H. Peter Anvin" <hpa@zytor.com> wrote:
> On November 5, 2025 12:10:30 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
> >The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
> >variant needs 'divisor - 1' adding in between the multiply and
> >divide so cannot easily be done by a caller.
> >
> >Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
> >and implement the 'round down' and 'round up' using it.
> >
> >Update the x86-64 asm to optimise for 'c' being a constant zero.
> >
> >Add kerndoc definitions for all three functions.
> >
> >Signed-off-by: David Laight <david.laight.linux@gmail.com>
> >Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
> >---
> >
> >Changes for v2 (formerly patch 1/3):
> >- Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
> > Although I'm not convinced the path is common enough to be worth
> > the two ilog2() calls.
> >
> >Changes for v3 (formerly patch 3/4):
> >- The early call to div64_u64() has been removed by patch 3.
> > Pretty much guaranteed to be a pessimisation.
> >
> >Changes for v4:
> >- For x86-64 split the multiply, add and divide into three asm blocks.
> > (gcc makes a pigs breakfast of (u128)a * b + c)
> >- Change the kerndoc since divide by zero will (probably) fault.
> >
> >Changes for v5:
> >- Fix test that excludes the add/adc asm block for constant zero 'add'.
> >
> > arch/x86/include/asm/div64.h | 20 +++++++++------
> > include/linux/math64.h | 48 +++++++++++++++++++++++++++++++++++-
> > lib/math/div64.c | 14 ++++++-----
> > 3 files changed, 67 insertions(+), 15 deletions(-)
> >
> >diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
> >index 9931e4c7d73f..6d8a3de3f43a 100644
> >--- a/arch/x86/include/asm/div64.h
> >+++ b/arch/x86/include/asm/div64.h
> >@@ -84,21 +84,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> > * Will generate an #DE when the result doesn't fit u64, could fix with an
> > * __ex_table[] entry when it becomes an issue.
> > */
> >-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
> >+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
> > {
> >- u64 q;
> >+ u64 rdx;
> >
> >- asm ("mulq %2; divq %3" : "=a" (q)
> >- : "a" (a), "rm" (mul), "rm" (div)
> >- : "rdx");
> >+ asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
> >
> >- return q;
> >+ if (!statically_true(!add))
> >+ asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
> >+ [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
> >+
> >+ asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
> >+
> >+ return rax;
> > }
> >-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
> >+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
...
>
> For the roundup case, I'm somewhat curious how this compares with doing:
I guess you are referring to the x86-64 asm version (left above).
> cmp $1, %rdx
> sbb $-1, %rax
>
> ... after the division. At least it means not needing to compute d - 1,
> saving an instruction as well as a register.
> Unfortunately using an lea instruction to compute %rax (which otherwise
> would incorporate both) isn't possible since it doesn't set the flags.
>
> The cmp; sbb sequence should be no slower than add;
> adc – I'm saying "no slower" because %rdx is never written to,
> so I think this is provably a better sequence; whether or not it is
> measurable is another thing (but if we are tweaking this stuff...)
I wanted the same function as the non-x64-64 version and 'multiply and add'
possibly has other uses.
The instruction to calculate 'd - 1' (if not a constant) will usually
execute in parallel with an earlier instruction (eg the multiply)
so will be pretty much 'zero cost'.
The add/adc pair are in the 'register dependency chain' - so add a clock each.
The same is true for your cmp/sbb pair.
(Except on pre-broadwell Intel cpu where adc/sbb are two clocks.
I've lost the full reference, the initial changes fixed 'adc $0,x' and
generated the carry flag immediately and only delayed the result.
The doc said 'adc $0,reg' not 'adc $const,reg' - so maybe the sbb $-1
was two clocks for longer.)
David
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v5 next 5/9] lib: Add tests for mul_u64_u64_div_u64_roundup()
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (3 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 4/9] lib: Add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup() David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:10 ` [PATCH v5 next 6/9] lib: test_mul_u64_u64_div_u64: Test both generic and arch versions David Laight
` (3 subsequent siblings)
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
Replicate the existing mul_u64_u64_div_u64() test cases with round up.
Update the shell script that verifies the table, remove the comment
markers so that it can be directly pasted into a shell.
Rename the divisor from 'c' to 'd' to match mul_u64_add_u64_div_u64().
If any tests fail then fail the module load with -EINVAL.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Changes for v3:
- Rename 'c' to 'd' to match mul_u64_add_u64_div_u64()
Changes for v4:
- Fix shell script that verifies the table
lib/math/test_mul_u64_u64_div_u64.c | 122 +++++++++++++++++-----------
1 file changed, 73 insertions(+), 49 deletions(-)
diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c
index 58d058de4e73..4d5e4e5dac67 100644
--- a/lib/math/test_mul_u64_u64_div_u64.c
+++ b/lib/math/test_mul_u64_u64_div_u64.c
@@ -10,61 +10,73 @@
#include <linux/printk.h>
#include <linux/math64.h>
-typedef struct { u64 a; u64 b; u64 c; u64 result; } test_params;
+typedef struct { u64 a; u64 b; u64 d; u64 result; uint round_up;} test_params;
static test_params test_values[] = {
/* this contains many edge values followed by a couple random values */
-{ 0xb, 0x7, 0x3, 0x19 },
-{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000 },
-{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001 },
-{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000 },
-{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000 },
-{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab },
-{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000 },
-{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff },
-{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851 },
-{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554 },
-{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3 },
-{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2 },
-{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007 },
-{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001 },
-{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001 },
-{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000 },
-{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001 },
-{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002 },
-{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8 },
-{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca },
-{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b },
-{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9 },
-{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe },
-{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18 },
-{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35 },
-{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f },
-{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961 },
-{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216 },
+{ 0xb, 0x7, 0x3, 0x19, 1 },
+{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000, 0 },
+{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001, 0 },
+{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000, 1 },
+{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000, 1 },
+{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab, 0 },
+{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000, 1 },
+{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff, 1 },
+{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851, 1 },
+{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554, 1 },
+{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3, 1 },
+{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2, 1 },
+{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007, 1 },
+{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001, 1 },
+{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001, 0 },
+{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000, 1 },
+{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001, 1 },
+{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002, 1 },
+{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8, 1 },
+{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca, 1 },
+{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b, 1 },
+{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9, 1 },
+{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe, 0 },
+{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18, 1 },
+{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35, 1 },
+{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f, 1 },
+{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961, 1 },
+{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216, 1 },
};
/*
* The above table can be verified with the following shell script:
- *
- * #!/bin/sh
- * sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4/p' \
- * lib/math/test_mul_u64_u64_div_u64.c |
- * while read a b c r; do
- * expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $c | bc )
- * given=$( printf "%X\n" $r )
- * if [ "$expected" = "$given" ]; then
- * echo "$a * $b / $c = $r OK"
- * else
- * echo "$a * $b / $c = $r is wrong" >&2
- * echo "should be equivalent to 0x$expected" >&2
- * exit 1
- * fi
- * done
+
+#!/bin/sh
+sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4 \5/p' \
+ lib/math/test_mul_u64_u64_div_u64.c |
+while read a b d r e; do
+ expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $d | bc )
+ given=$( printf "%X\n" $r )
+ if [ "$expected" = "$given" ]; then
+ echo "$a * $b / $d = $r OK"
+ else
+ echo "$a * $b / $d = $r is wrong" >&2
+ echo "should be equivalent to 0x$expected" >&2
+ exit 1
+ fi
+ expected=$( printf "obase=16; ibase=16; (%X * %X + %X) / %X\n" $a $b $((d-1)) $d | bc )
+ given=$( printf "%X\n" $((r + e)) )
+ if [ "$expected" = "$given" ]; then
+ echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) OK"
+ else
+ echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) is wrong" >&2
+ echo "should be equivalent to 0x$expected" >&2
+ exit 1
+ fi
+done
+
*/
static int __init test_init(void)
{
+ int errors = 0;
+ int tests = 0;
int i;
pr_info("Starting mul_u64_u64_div_u64() test\n");
@@ -72,19 +84,31 @@ static int __init test_init(void)
for (i = 0; i < ARRAY_SIZE(test_values); i++) {
u64 a = test_values[i].a;
u64 b = test_values[i].b;
- u64 c = test_values[i].c;
+ u64 d = test_values[i].d;
u64 expected_result = test_values[i].result;
- u64 result = mul_u64_u64_div_u64(a, b, c);
+ u64 result = mul_u64_u64_div_u64(a, b, d);
+ u64 result_up = mul_u64_u64_div_u64_roundup(a, b, d);
+
+ tests += 2;
if (result != expected_result) {
- pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, c);
+ pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, d);
pr_err("ERROR: expected result: %016llx\n", expected_result);
pr_err("ERROR: obtained result: %016llx\n", result);
+ errors++;
+ }
+ expected_result += test_values[i].round_up;
+ if (result_up != expected_result) {
+ pr_err("ERROR: 0x%016llx * 0x%016llx +/ 0x%016llx\n", a, b, d);
+ pr_err("ERROR: expected result: %016llx\n", expected_result);
+ pr_err("ERROR: obtained result: %016llx\n", result_up);
+ errors++;
}
}
- pr_info("Completed mul_u64_u64_div_u64() test\n");
- return 0;
+ pr_info("Completed mul_u64_u64_div_u64() test, %d tests, %d errors\n",
+ tests, errors);
+ return errors ? -EINVAL : 0;
}
static void __exit test_exit(void)
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v5 next 6/9] lib: test_mul_u64_u64_div_u64: Test both generic and arch versions
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (4 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 5/9] lib: Add tests for mul_u64_u64_div_u64_roundup() David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:10 ` [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86 David Laight
` (2 subsequent siblings)
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
Change the #if in div64.c so that test_mul_u64_u64_div_u64.c
can compile and test the generic version (including the 'long multiply')
on architectures (eg amd64) that define their own copy.
Test the kernel version and the locally compiled version on all arch.
Output the time taken (in ns) on the 'test completed' trace.
For reference, on my zen 5, the optimised version takes ~220ns and the
generic version ~3350ns.
Using the native multiply saves ~200ns and adding back the ilog2() 'optimisation'
test adds ~50ns.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Changes for v4:
- Fix build on non x86 (eg arm32)
Changes for v5:
- Fix build on 32bit x86
lib/math/div64.c | 8 +++--
lib/math/test_mul_u64_u64_div_u64.c | 52 +++++++++++++++++++++++++----
2 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index a88391b8ada0..18a9ba26c418 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -177,16 +177,18 @@ EXPORT_SYMBOL(div64_s64);
* Iterative div/mod for use when dividend is not expected to be much
* bigger than divisor.
*/
+#ifndef iter_div_u64_rem
u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
return __iter_div_u64_rem(dividend, divisor, remainder);
}
EXPORT_SYMBOL(iter_div_u64_rem);
+#endif
-#ifndef mul_u64_add_u64_div_u64
+#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
-#if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
/* native 64x64=128 bits multiplication */
u128 prod = (u128)a * b + c;
@@ -267,5 +269,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
return res;
}
+#if !defined(test_mul_u64_add_u64_div_u64)
EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
+#endif
diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c
index 4d5e4e5dac67..d8d2c18c4614 100644
--- a/lib/math/test_mul_u64_u64_div_u64.c
+++ b/lib/math/test_mul_u64_u64_div_u64.c
@@ -73,21 +73,34 @@ done
*/
-static int __init test_init(void)
+static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+static int __init test_run(unsigned int fn_no, const char *fn_name)
{
+ u64 start_time;
int errors = 0;
int tests = 0;
int i;
- pr_info("Starting mul_u64_u64_div_u64() test\n");
+ start_time = ktime_get_ns();
for (i = 0; i < ARRAY_SIZE(test_values); i++) {
u64 a = test_values[i].a;
u64 b = test_values[i].b;
u64 d = test_values[i].d;
u64 expected_result = test_values[i].result;
- u64 result = mul_u64_u64_div_u64(a, b, d);
- u64 result_up = mul_u64_u64_div_u64_roundup(a, b, d);
+ u64 result, result_up;
+
+ switch (fn_no) {
+ default:
+ result = mul_u64_u64_div_u64(a, b, d);
+ result_up = mul_u64_u64_div_u64_roundup(a, b, d);
+ break;
+ case 1:
+ result = test_mul_u64_add_u64_div_u64(a, b, 0, d);
+ result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d);
+ break;
+ }
tests += 2;
@@ -106,15 +119,40 @@ static int __init test_init(void)
}
}
- pr_info("Completed mul_u64_u64_div_u64() test, %d tests, %d errors\n",
- tests, errors);
- return errors ? -EINVAL : 0;
+ pr_info("Completed %s() test, %d tests, %d errors, %llu ns\n",
+ fn_name, tests, errors, ktime_get_ns() - start_time);
+ return errors;
+}
+
+static int __init test_init(void)
+{
+ pr_info("Starting mul_u64_u64_div_u64() test\n");
+ if (test_run(0, "mul_u64_u64_div_u64"))
+ return -EINVAL;
+ if (test_run(1, "test_mul_u64_u64_div_u64"))
+ return -EINVAL;
+ return 0;
}
static void __exit test_exit(void)
{
}
+/* Compile the generic mul_u64_add_u64_div_u64() code */
+#undef __div64_32
+#define __div64_32 __div64_32
+#define div_s64_rem div_s64_rem
+#define div64_u64_rem div64_u64_rem
+#define div64_u64 div64_u64
+#define div64_s64 div64_s64
+#define iter_div_u64_rem iter_div_u64_rem
+
+#undef mul_u64_add_u64_div_u64
+#define mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64
+#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64
+
+#include "div64.c"
+
module_init(test_init);
module_exit(test_exit);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (5 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 6/9] lib: test_mul_u64_u64_div_u64: Test both generic and arch versions David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 23:45 ` H. Peter Anvin
2025-11-05 20:10 ` [PATCH v5 next 8/9] lib: mul_u64_u64_div_u64() Optimise the divide code David Laight
2025-11-05 20:10 ` [PATCH v5 next 9/9] lib: test_mul_u64_u64_div_u64: Test the 32bit code on 64bit David Laight
8 siblings, 1 reply; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a + u32_b).
As well as the extra instructions it can generate a lot of spills to stack
(including spills of constant zeros and even multiplies by constant zero).
mul_u32_u32() already exists to optimise the multiply.
Add a similar add_u64_u32() for the addition.
Disable both for clang - it generates better code without them.
Move the 64x64 => 128 multiply into a static inline helper function
for code clarity.
No need for the a/b_hi/lo variables, the implicit casts on the function
calls do the work for us.
Should have minimal effect on the generated code.
Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply
in mul_u64_add_u64_div_u64().
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Changes for v4:
- merge in patch 8.
- Add comments about gcc being 'broken' for mixed 32/64 bit maths.
clang doesn't have the same issues.
- Use a #define to define mul_add() to avoid 'defined but not used'
errors.
arch/x86/include/asm/div64.h | 19 +++++++++++++++++
include/linux/math64.h | 11 ++++++++++
lib/math/div64.c | 40 +++++++++++++++++++++++-------------
3 files changed, 56 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 6d8a3de3f43a..30fd06ede751 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
}
#define div_u64_rem div_u64_rem
+/*
+ * gcc tends to zero extend 32bit values and do full 64bit maths.
+ * Define asm functions that avoid this.
+ * (clang generates better code for the C versions.)
+ */
+#ifndef __clang__
static inline u64 mul_u32_u32(u32 a, u32 b)
{
u32 high, low;
@@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
}
#define mul_u32_u32 mul_u32_u32
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+ u32 high = a >> 32, low = a;
+
+ asm ("addl %[b], %[low]; adcl $0, %[high]"
+ : [low] "+r" (low), [high] "+r" (high)
+ : [b] "rm" (b) );
+
+ return low | (u64)high << 32;
+}
+#define add_u64_u32 add_u64_u32
+#endif
+
/*
* __div64_32() is never called on x86, so prevent the
* generic definition from getting built.
diff --git a/include/linux/math64.h b/include/linux/math64.h
index e889d850b7f1..cc305206d89f 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -158,6 +158,17 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
}
#endif
+#ifndef add_u64_u32
+/*
+ * Many a GCC version also messes this up.
+ * Zero extending b and then spilling everything to stack.
+ */
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+ return a + b;
+}
+#endif
+
#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#ifndef mul_u64_u32_shr
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 18a9ba26c418..bb57a48ce36a 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -186,33 +186,45 @@ EXPORT_SYMBOL(iter_div_u64_rem);
#endif
#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
-u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
-{
+
+#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c)
+
#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
/* native 64x64=128 bits multiplication */
u128 prod = (u128)a * b + c;
- u64 n_lo = prod, n_hi = prod >> 64;
+
+ *p_lo = prod;
+ return prod >> 64;
+}
#else
- /* perform a 64x64=128 bits multiplication manually */
- u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
+ /* perform a 64x64=128 bits multiplication in 32bit chunks */
u64 x, y, z;
/* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
- x = (u64)a_lo * b_lo + (u32)c;
- y = (u64)a_lo * b_hi + (u32)(c >> 32);
- y += (u32)(x >> 32);
- z = (u64)a_hi * b_hi + (u32)(y >> 32);
- y = (u64)a_hi * b_lo + (u32)y;
- z += (u32)(y >> 32);
- x = (y << 32) + (u32)x;
-
- u64 n_lo = x, n_hi = z;
+ x = mul_add(a, b, c);
+ y = mul_add(a, b >> 32, c >> 32);
+ y = add_u64_u32(y, x >> 32);
+ z = mul_add(a >> 32, b >> 32, y >> 32);
+ y = mul_add(a >> 32, b, y);
+ *p_lo = (y << 32) + (u32)x;
+ return add_u64_u32(z, y >> 32);
+}
#endif
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
+{
+ u64 n_lo, n_hi;
+
+ n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
+
if (!n_hi)
return div64_u64(n_lo, d);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86
2025-11-05 20:10 ` [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86 David Laight
@ 2025-11-05 23:45 ` H. Peter Anvin
2025-11-06 9:26 ` David Laight
0 siblings, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2025-11-05 23:45 UTC (permalink / raw)
To: David Laight, Andrew Morton, linux-kernel
Cc: u.kleine-koenig, Nicolas Pitre, Oleg Nesterov, Peter Zijlstra,
Biju Das, Borislav Petkov, Dave Hansen, Ingo Molnar,
Thomas Gleixner, Li RongQing, Khazhismel Kumykov, Jens Axboe, x86
On November 5, 2025 12:10:33 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a + u32_b).
>As well as the extra instructions it can generate a lot of spills to stack
>(including spills of constant zeros and even multiplies by constant zero).
>
>mul_u32_u32() already exists to optimise the multiply.
>Add a similar add_u64_32() for the addition.
>Disable both for clang - it generates better code without them.
>
>Move the 64x64 => 128 multiply into a static inline helper function
>for code clarity.
>No need for the a/b_hi/lo variables, the implicit casts on the function
>calls do the work for us.
>Should have minimal effect on the generated code.
>
>Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply
>in mul_u64_add_u64_div_u64().
>
>Signed-off-by: David Laight <david.laight.linux@gmail.com>
>Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
>---
>
>Changes for v4:
>- merge in patch 8.
>- Add comments about gcc being 'broken' for mixed 32/64 bit maths.
> clang doesn't have the same issues.
>- Use a #define for define mul_add() to avoid 'defined but not used'
> errors.
>
> arch/x86/include/asm/div64.h | 19 +++++++++++++++++
> include/linux/math64.h | 11 ++++++++++
> lib/math/div64.c | 40 +++++++++++++++++++++++-------------
> 3 files changed, 56 insertions(+), 14 deletions(-)
>
>diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
>index 6d8a3de3f43a..30fd06ede751 100644
>--- a/arch/x86/include/asm/div64.h
>+++ b/arch/x86/include/asm/div64.h
>@@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
> }
> #define div_u64_rem div_u64_rem
>
>+/*
>+ * gcc tends to zero extend 32bit values and do full 64bit maths.
>+ * Define asm functions that avoid this.
>+ * (clang generates better code for the C versions.)
>+ */
>+#ifndef __clang__
> static inline u64 mul_u32_u32(u32 a, u32 b)
> {
> u32 high, low;
>@@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> }
> #define mul_u32_u32 mul_u32_u32
>
>+static inline u64 add_u64_u32(u64 a, u32 b)
>+{
>+ u32 high = a >> 32, low = a;
>+
>+ asm ("addl %[b], %[low]; adcl $0, %[high]"
>+ : [low] "+r" (low), [high] "+r" (high)
>+ : [b] "rm" (b) );
>+
>+ return low | (u64)high << 32;
>+}
>+#define add_u64_u32 add_u64_u32
>+#endif
>+
> /*
> * __div64_32() is never called on x86, so prevent the
> * generic definition from getting built.
>diff --git a/include/linux/math64.h b/include/linux/math64.h
>index e889d850b7f1..cc305206d89f 100644
>--- a/include/linux/math64.h
>+++ b/include/linux/math64.h
>@@ -158,6 +158,17 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> }
> #endif
>
>+#ifndef add_u64_u32
>+/*
>+ * Many a GCC version also messes this up.
>+ * Zero extending b and then spilling everything to stack.
>+ */
>+static inline u64 add_u64_u32(u64 a, u32 b)
>+{
>+ return a + b;
>+}
>+#endif
>+
> #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
>
> #ifndef mul_u64_u32_shr
>diff --git a/lib/math/div64.c b/lib/math/div64.c
>index 18a9ba26c418..bb57a48ce36a 100644
>--- a/lib/math/div64.c
>+++ b/lib/math/div64.c
>@@ -186,33 +186,45 @@ EXPORT_SYMBOL(iter_div_u64_rem);
> #endif
>
> #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
>-u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
>-{
>+
>+#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c)
>+
> #if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
>
>+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
>+{
> /* native 64x64=128 bits multiplication */
> u128 prod = (u128)a * b + c;
>- u64 n_lo = prod, n_hi = prod >> 64;
>+
>+ *p_lo = prod;
>+ return prod >> 64;
>+}
>
> #else
>
>- /* perform a 64x64=128 bits multiplication manually */
>- u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
>+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
>+{
>+ /* perform a 64x64=128 bits multiplication in 32bit chunks */
> u64 x, y, z;
>
> /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
>- x = (u64)a_lo * b_lo + (u32)c;
>- y = (u64)a_lo * b_hi + (u32)(c >> 32);
>- y += (u32)(x >> 32);
>- z = (u64)a_hi * b_hi + (u32)(y >> 32);
>- y = (u64)a_hi * b_lo + (u32)y;
>- z += (u32)(y >> 32);
>- x = (y << 32) + (u32)x;
>-
>- u64 n_lo = x, n_hi = z;
>+ x = mul_add(a, b, c);
>+ y = mul_add(a, b >> 32, c >> 32);
>+ y = add_u64_u32(y, x >> 32);
>+ z = mul_add(a >> 32, b >> 32, y >> 32);
>+ y = mul_add(a >> 32, b, y);
>+ *p_lo = (y << 32) + (u32)x;
>+ return add_u64_u32(z, y >> 32);
>+}
>
> #endif
>
>+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
>+{
>+ u64 n_lo, n_hi;
>+
>+ n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
>+
> if (!n_hi)
> return div64_u64(n_lo, d);
>
By the way have you filed gcc bug reports for this?
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86
2025-11-05 23:45 ` H. Peter Anvin
@ 2025-11-06 9:26 ` David Laight
0 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-06 9:26 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andrew Morton, linux-kernel, u.kleine-koenig, Nicolas Pitre,
Oleg Nesterov, Peter Zijlstra, Biju Das, Borislav Petkov,
Dave Hansen, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
On Wed, 05 Nov 2025 15:45:29 -0800
"H. Peter Anvin" <hpa@zytor.com> wrote:
> On November 5, 2025 12:10:33 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
> >gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a + u32_b).
> >As well as the extra instructions it can generate a lot of spills to stack
> >(including spills of constant zeros and even multiplies by constant zero).
> >
> >mul_u32_u32() already exists to optimise the multiply.
> >Add a similar add_u64_32() for the addition.
> >Disable both for clang - it generates better code without them.
> >
> >Move the 64x64 => 128 multiply into a static inline helper function
> >for code clarity.
> >No need for the a/b_hi/lo variables, the implicit casts on the function
> >calls do the work for us.
> >Should have minimal effect on the generated code.
> >
> >Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply
> >in mul_u64_add_u64_div_u64().
> >
> >Signed-off-by: David Laight <david.laight.linux@gmail.com>
> >Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
> >---
> >
> >Changes for v4:
> >- merge in patch 8.
> >- Add comments about gcc being 'broken' for mixed 32/64 bit maths.
> > clang doesn't have the same issues.
> >- Use a #define for define mul_add() to avoid 'defined but not used'
> > errors.
> >
> > arch/x86/include/asm/div64.h | 19 +++++++++++++++++
> > include/linux/math64.h | 11 ++++++++++
> > lib/math/div64.c | 40 +++++++++++++++++++++++-------------
> > 3 files changed, 56 insertions(+), 14 deletions(-)
> >
> >diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
> >index 6d8a3de3f43a..30fd06ede751 100644
> >--- a/arch/x86/include/asm/div64.h
> >+++ b/arch/x86/include/asm/div64.h
> >@@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
> > }
> > #define div_u64_rem div_u64_rem
> >
> >+/*
> >+ * gcc tends to zero extend 32bit values and do full 64bit maths.
> >+ * Define asm functions that avoid this.
> >+ * (clang generates better code for the C versions.)
> >+ */
> >+#ifndef __clang__
> > static inline u64 mul_u32_u32(u32 a, u32 b)
> > {
> > u32 high, low;
> >@@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> > }
> > #define mul_u32_u32 mul_u32_u32
> >
> >+static inline u64 add_u64_u32(u64 a, u32 b)
> >+{
> >+ u32 high = a >> 32, low = a;
> >+
> >+ asm ("addl %[b], %[low]; adcl $0, %[high]"
> >+ : [low] "+r" (low), [high] "+r" (high)
> >+ : [b] "rm" (b) );
> >+
> >+ return low | (u64)high << 32;
> >+}
> >+#define add_u64_u32 add_u64_u32
> >+#endif
...
>
> By the way have you filed gcc bug reports for this?
As in the need for the asm() above?
No...
I doubt one was filed when the mul version was added either.
ISTR that some very recent gcc versions were a bit better, but it depends
on minor code changes and compiler options.
I suspect that internally gcc sometimes keeps a 64bit value as two 32bit
ones, but at other times it is assigned to a 64bit internal register.
If the latter happens it always promotes a 32bit value to 64 bits and
assigns to another 64bit register.
At that point it won't split the 64bit registers - so a lot of spills to
stack happen when it tries to assign real registers.
So breath on an 'A' (dx:ax) constraint and the generated code is horrid.
Even the lo | (u64)hi << 32 can generate 'or' instructions.
The same happens for int128 on 64bit.
David
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v5 next 8/9] lib: mul_u64_u64_div_u64() Optimise the divide code
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (6 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 7/9] lib: mul_u64_u64_div_u64() optimise multiply on 32bit x86 David Laight
@ 2025-11-05 20:10 ` David Laight
2025-11-05 20:10 ` [PATCH v5 next 9/9] lib: test_mul_u64_u64_div_u64: Test the 32bit code on 64bit David Laight
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
Replace the bit by bit algorithm with one that generates 16 bits
per iteration on 32bit architectures and 32 bits on 64bit ones.
On my zen 5 this reduces the time for the tests (using the generic
code) from ~3350ns to ~1000ns.
Running the 32bit algorithm on 64bit x86 takes ~1500ns.
It'll be slightly slower on a real 32bit system, mostly due
to register pressure.
The savings for 32bit x86 are much higher (tested in userspace).
The worst case (lots of bits in the quotient) drops from ~900 clocks
to ~130 (pretty much independent of the arguments).
Other 32bit architectures may see better savings.
It is possible to optimise for divisors that span less than
__LONG_WIDTH__/2 bits. However I suspect they don't happen that often
and it doesn't remove any slow cpu divide instructions which dominate
the result.
Typical improvements for 64bit random divides:
old new
sandy bridge: 470 150
haswell: 400 144
piledriver: 960 467 I think rdpmc is very slow.
zen5: 244 80
(Timing is 'rdpmc; mul_div(); rdpmc' with the multiply depending on the
first rdpmc and the second rdpmc depending on the quotient.)
Object code (64bit x86 test program): old 0x173 new 0x141.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Algorithm unchanged since v3.
lib/math/div64.c | 124 ++++++++++++++++++++++++++++++++---------------
1 file changed, 85 insertions(+), 39 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index bb57a48ce36a..d1e92ea24fce 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,7 +190,6 @@ EXPORT_SYMBOL(iter_div_u64_rem);
#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c)
#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
-
static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
/* native 64x64=128 bits multiplication */
@@ -199,9 +198,7 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
*p_lo = prod;
return prod >> 64;
}
-
#else
-
static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
/* perform a 64x64=128 bits multiplication in 32bit chunks */
@@ -216,12 +213,37 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
*p_lo = (y << 32) + (u32)x;
return add_u64_u32(z, y >> 32);
}
+#endif
+
+#ifndef BITS_PER_ITER
+#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16)
+#endif
+
+#if BITS_PER_ITER == 32
+#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c)
+#define add_u64_long(a, b) ((a) + (b))
+#else
+#undef BITS_PER_ITER
+#define BITS_PER_ITER 16
+static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c)
+{
+ u64 n_lo = mul_add(a, b, c);
+ u64 n_med = mul_add(a >> 32, b, c >> 32);
+
+ n_med = add_u64_u32(n_med, n_lo >> 32);
+ *p_lo = n_med << 32 | (u32)n_lo;
+ return n_med >> 32;
+}
+#define add_u64_long(a, b) add_u64_u32(a, b)
#endif
u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
- u64 n_lo, n_hi;
+ unsigned long d_msig, q_digit;
+ unsigned int reps, d_z_hi;
+ u64 quotient, n_lo, n_hi;
+ u32 overflow;
n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
@@ -240,46 +262,70 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
return ~0ULL;
}
- int shift = __builtin_ctzll(d);
-
- /* try reducing the fraction in case the dividend becomes <= 64 bits */
- if ((n_hi >> shift) == 0) {
- u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
-
- return div64_u64(n, d >> shift);
- /*
- * The remainder value if needed would be:
- * res = div64_u64_rem(n, d >> shift, &rem);
- * rem = (rem << shift) + (n_lo - (n << shift));
- */
+ /* Left align the divisor, shifting the dividend to match */
+ d_z_hi = __builtin_clzll(d);
+ if (d_z_hi) {
+ d <<= d_z_hi;
+ n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi);
+ n_lo <<= d_z_hi;
}
- /* Do the full 128 by 64 bits division */
-
- shift = __builtin_clzll(d);
- d <<= shift;
-
- int p = 64 + shift;
- u64 res = 0;
- bool carry;
+ reps = 64 / BITS_PER_ITER;
+ /* Optimise loop count for small dividends */
+ if (!(u32)(n_hi >> 32)) {
+ reps -= 32 / BITS_PER_ITER;
+ n_hi = n_hi << 32 | n_lo >> 32;
+ n_lo <<= 32;
+ }
+#if BITS_PER_ITER == 16
+ if (!(u32)(n_hi >> 48)) {
+ reps--;
+ n_hi = add_u64_u32(n_hi << 16, n_lo >> 48);
+ n_lo <<= 16;
+ }
+#endif
- do {
- carry = n_hi >> 63;
- shift = carry ? 1 : __builtin_clzll(n_hi);
- if (p < shift)
- break;
- p -= shift;
- n_hi <<= shift;
- n_hi |= n_lo >> (64 - shift);
- n_lo <<= shift;
- if (carry || (n_hi >= d)) {
- n_hi -= d;
- res |= 1ULL << p;
+ /* Invert the dividend so we can use add instead of subtract. */
+ n_lo = ~n_lo;
+ n_hi = ~n_hi;
+
+ /*
+ * Get the most significant BITS_PER_ITER bits of the divisor.
+ * This is used to get a low 'guestimate' of the quotient digit.
+ */
+ d_msig = (d >> (64 - BITS_PER_ITER)) + 1;
+
+ /*
+ * Now do a 'long division' with BITS_PER_ITER bit 'digits'.
+ * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits.
+ * The worst case is dividing ~0 by 0x8000 which requires two subtracts.
+ */
+ quotient = 0;
+ while (reps--) {
+ q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig;
+ /* Shift 'n' left to align with the product q_digit * d */
+ overflow = n_hi >> (64 - BITS_PER_ITER);
+ n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER));
+ n_lo <<= BITS_PER_ITER;
+ /* Add product to negated divisor */
+ overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
+ /* Adjust for the q_digit 'guestimate' being low */
+ while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) {
+ q_digit++;
+ n_hi += d;
+ overflow += n_hi < d;
}
- } while (n_hi);
- /* The remainder value if needed would be n_hi << p */
+ quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit);
+ }
- return res;
+ /*
+ * The above only ensures the remainder doesn't overflow,
+ * it can still be possible to add (aka subtract) another copy
+ * of the divisor.
+ */
+ if ((n_hi + d) > n_hi)
+ quotient++;
+ return quotient;
}
#if !defined(test_mul_u64_add_u64_div_u64)
EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH v5 next 9/9] lib: test_mul_u64_u64_div_u64: Test the 32bit code on 64bit
2025-11-05 20:10 [PATCH v5 next 0/9] Implement mul_u64_u64_div_u64_roundup() David Laight
` (7 preceding siblings ...)
2025-11-05 20:10 ` [PATCH v5 next 8/9] lib: mul_u64_u64_div_u64() Optimise the divide code David Laight
@ 2025-11-05 20:10 ` David Laight
8 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2025-11-05 20:10 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: David Laight, u.kleine-koenig, Nicolas Pitre, Oleg Nesterov,
Peter Zijlstra, Biju Das, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Li RongQing,
Khazhismel Kumykov, Jens Axboe, x86
There are slight differences in the mul_u64_add_u64_div_u64() code
between 32bit and 64bit systems.
Compile and test the 32bit version on 64bit hosts for better test
coverage.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
---
Changes for v4:
- Fix build on non x86-64
lib/math/test_mul_u64_u64_div_u64.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c
index d8d2c18c4614..338d014f0c73 100644
--- a/lib/math/test_mul_u64_u64_div_u64.c
+++ b/lib/math/test_mul_u64_u64_div_u64.c
@@ -74,6 +74,10 @@ done
*/
static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+#if __LONG_WIDTH__ >= 64
+#define TEST_32BIT_DIV
+static u64 test_mul_u64_add_u64_div_u64_32bit(u64 a, u64 b, u64 c, u64 d);
+#endif
static int __init test_run(unsigned int fn_no, const char *fn_name)
{
@@ -100,6 +104,12 @@ static int __init test_run(unsigned int fn_no, const char *fn_name)
result = test_mul_u64_add_u64_div_u64(a, b, 0, d);
result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d);
break;
+#ifdef TEST_32BIT_DIV
+ case 2:
+ result = test_mul_u64_add_u64_div_u64_32bit(a, b, 0, d);
+ result_up = test_mul_u64_add_u64_div_u64_32bit(a, b, d - 1, d);
+ break;
+#endif
}
tests += 2;
@@ -131,6 +141,10 @@ static int __init test_init(void)
return -EINVAL;
if (test_run(1, "test_mul_u64_u64_div_u64"))
return -EINVAL;
+#ifdef TEST_32BIT_DIV
+ if (test_run(2, "test_mul_u64_u64_div_u64_32bit"))
+ return -EINVAL;
+#endif
return 0;
}
@@ -153,6 +167,21 @@ static void __exit test_exit(void)
#include "div64.c"
+#ifdef TEST_32BIT_DIV
+/* Recompile the generic code for 32bit long */
+#undef test_mul_u64_add_u64_div_u64
+#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64_32bit
+#undef BITS_PER_ITER
+#define BITS_PER_ITER 16
+
+#define mul_u64_u64_add_u64 mul_u64_u64_add_u64_32bit
+#undef mul_u64_long_add_u64
+#undef add_u64_long
+#undef mul_add
+
+#include "div64.c"
+#endif
+
module_init(test_init);
module_exit(test_exit);
--
2.39.5
^ permalink raw reply related [flat|nested] 15+ messages in thread