linux-arch.vger.kernel.org archive mirror
* [PATCH v4 0/4] simplify do_div() with constant divisor
@ 2024-10-03 21:16 Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-03 21:16 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

While working on mul_u64_u64_div_u64() improvements I realized that there
is a better way to perform a 64x64->128 bits multiplication with overflow
handling.
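
For illustration, the scheme (as used in patch #2, minus the bias
handling) boils down to folding each 32-bit carry into the next
partial product so that no intermediate sum can overflow 64 bits.
Here is a minimal userspace sketch; mul_hi64() is a made-up name and
the cross-check assumes a compiler providing unsigned __int128:

```
#include <assert.h>
#include <stdint.h>

/* Top 64 bits of the 128-bit product m * n, from 32-bit halves.
 * Each carry is folded into the next partial product, so every
 * intermediate sum fits in 64 bits without explicit checks. */
static uint64_t mul_hi64(uint64_t m, uint64_t n)
{
	uint32_t m_lo = m, m_hi = m >> 32;
	uint32_t n_lo = n, n_hi = n >> 32;
	uint64_t x, y;

	x = (uint64_t)m_lo * n_lo;
	y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32);
	x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
	y = (uint64_t)m_hi * n_lo + (uint32_t)y;
	return x + (uint32_t)(y >> 32);
}

int main(void)
{
	uint64_t m = 0xfffffffffffffffbULL, n = 0xffffffffffffffffULL;

	assert(mul_hi64(m, n) ==
	       (uint64_t)(((unsigned __int128)m * n) >> 64));
	return 0;
}
```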

Change from v3:

- Added timings to commit log of patch #4.

Link to v3: https://lore.kernel.org/lkml/20240708012749.2098373-2-nico@fluxnic.net/T/

Change from v2:

- Fix last minute edit screw-up (missing one function return type).

Link to v2: https://lore.kernel.org/lkml/20240707171919.1951895-1-nico@fluxnic.net/

Changes from v1:

- Formalize condition for when overflow handling can be skipped.
- Make this condition apply only if it can be determined at compile time
  (guard against the compiler not always inlining code).
- Keep the ARM assembly but apply the above changes to it as well.
- Force __always_inline when optimizing for performance.
- Augment test_div64.c with important edge cases.

Link to v1: https://lore.kernel.org/lkml/20240705022334.1378363-1-nico@fluxnic.net/

The diffstat is:

 arch/arm/include/asm/div64.h |  13 +++-
 include/asm-generic/div64.h  | 121 ++++++++++++-----------------------
 lib/math/test_div64.c        |  85 +++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 85 deletions(-)


* [PATCH v4 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32()
  2024-10-03 21:16 [PATCH v4 0/4] simplify do_div() with constant divisor Nicolas Pitre
@ 2024-10-03 21:16 ` Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32() Nicolas Pitre
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-03 21:16 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Be sure to test the extreme cases with and without bias.
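
For reference, the expected { quotient, remainder } pairs below can be
regenerated with a trivial userspace helper along these lines (a
hypothetical review aid, not part of the patch):

```
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* the new edge-case dividends and divisors from this patch */
	static const uint64_t dividends[] = {
		0xfffffffffffffffbULL,
		0xfffffffffffffffcULL,
		0xffffffffffffffffULL,
	};
	static const uint32_t divisors[] = {
		0x80000001, 0xfffffffe, 0xffffffff,
	};

	for (size_t i = 0; i < sizeof(dividends) / sizeof(*dividends); i++)
		for (size_t j = 0; j < sizeof(divisors) / sizeof(*divisors); j++)
			printf("{ 0x%016" PRIx64 ", 0x%08" PRIx32 " },\n",
			       dividends[i] / divisors[j],
			       (uint32_t)(dividends[i] % divisors[j]));
	return 0;
}
```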

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 lib/math/test_div64.c | 85 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/lib/math/test_div64.c b/lib/math/test_div64.c
index c15edd688d..3cd699b654 100644
--- a/lib/math/test_div64.c
+++ b/lib/math/test_div64.c
@@ -26,6 +26,9 @@ static const u64 test_div64_dividends[] = {
 	0x0072db27380dd689,
 	0x0842f488162e2284,
 	0xf66745411d8ab063,
+	0xfffffffffffffffb,
+	0xfffffffffffffffc,
+	0xffffffffffffffff,
 };
 #define SIZE_DIV64_DIVIDENDS ARRAY_SIZE(test_div64_dividends)
 
@@ -37,7 +40,10 @@ static const u64 test_div64_dividends[] = {
 #define TEST_DIV64_DIVISOR_5 0x0008a880
 #define TEST_DIV64_DIVISOR_6 0x003fd3ae
 #define TEST_DIV64_DIVISOR_7 0x0b658fac
-#define TEST_DIV64_DIVISOR_8 0xdc08b349
+#define TEST_DIV64_DIVISOR_8 0x80000001
+#define TEST_DIV64_DIVISOR_9 0xdc08b349
+#define TEST_DIV64_DIVISOR_A 0xfffffffe
+#define TEST_DIV64_DIVISOR_B 0xffffffff
 
 static const u32 test_div64_divisors[] = {
 	TEST_DIV64_DIVISOR_0,
@@ -49,13 +55,16 @@ static const u32 test_div64_divisors[] = {
 	TEST_DIV64_DIVISOR_6,
 	TEST_DIV64_DIVISOR_7,
 	TEST_DIV64_DIVISOR_8,
+	TEST_DIV64_DIVISOR_9,
+	TEST_DIV64_DIVISOR_A,
+	TEST_DIV64_DIVISOR_B,
 };
 #define SIZE_DIV64_DIVISORS ARRAY_SIZE(test_div64_divisors)
 
 static const struct {
 	u64 quotient;
 	u32 remainder;
-} test_div64_results[SIZE_DIV64_DIVISORS][SIZE_DIV64_DIVIDENDS] = {
+} test_div64_results[SIZE_DIV64_DIVIDENDS][SIZE_DIV64_DIVISORS] = {
 	{
 		{ 0x0000000013045e47, 0x00000001 },
 		{ 0x000000000161596c, 0x00000030 },
@@ -65,6 +74,9 @@ static const struct {
 		{ 0x00000000000013c4, 0x0004ce80 },
 		{ 0x00000000000002ae, 0x001e143c },
 		{ 0x000000000000000f, 0x0033e56c },
+		{ 0x0000000000000001, 0x2b27507f },
+		{ 0x0000000000000000, 0xab275080 },
+		{ 0x0000000000000000, 0xab275080 },
 		{ 0x0000000000000000, 0xab275080 },
 	}, {
 		{ 0x00000001c45c02d1, 0x00000000 },
@@ -75,7 +87,10 @@ static const struct {
 		{ 0x000000000001d637, 0x0004e5d9 },
 		{ 0x0000000000003fc9, 0x000713bb },
 		{ 0x0000000000000165, 0x029abe7d },
+		{ 0x000000000000001f, 0x673c193a },
 		{ 0x0000000000000012, 0x6e9f7e37 },
+		{ 0x000000000000000f, 0xe73c1977 },
+		{ 0x000000000000000f, 0xe73c1968 },
 	}, {
 		{ 0x000000197a3a0cf7, 0x00000002 },
 		{ 0x00000001d9632e5c, 0x00000021 },
@@ -85,7 +100,10 @@ static const struct {
 		{ 0x00000000001a7bb3, 0x00072331 },
 		{ 0x00000000000397ad, 0x0002c61b },
 		{ 0x000000000000141e, 0x06ea2e89 },
+		{ 0x00000000000001ca, 0x4c0a72e7 },
 		{ 0x000000000000010a, 0xab002ad7 },
+		{ 0x00000000000000e5, 0x4c0a767b },
+		{ 0x00000000000000e5, 0x4c0a7596 },
 	}, {
 		{ 0x0000017949e37538, 0x00000001 },
 		{ 0x0000001b62441f37, 0x00000055 },
@@ -95,7 +113,10 @@ static const struct {
 		{ 0x0000000001882ec6, 0x0005cbf9 },
 		{ 0x000000000035333b, 0x0017abdf },
 		{ 0x00000000000129f1, 0x0ab4520d },
+		{ 0x0000000000001a87, 0x18ff0472 },
 		{ 0x0000000000000f6e, 0x8ac0ce9b },
+		{ 0x0000000000000d43, 0x98ff397f },
+		{ 0x0000000000000d43, 0x98ff2c3c },
 	}, {
 		{ 0x000011f321a74e49, 0x00000006 },
 		{ 0x0000014d8481d211, 0x0000005b },
@@ -105,7 +126,10 @@ static const struct {
 		{ 0x0000000012a88828, 0x00036c97 },
 		{ 0x000000000287f16f, 0x002c2a25 },
 		{ 0x00000000000e2cc7, 0x02d581e3 },
+		{ 0x0000000000014318, 0x2ee07d7f },
 		{ 0x000000000000bbf4, 0x1ba08c03 },
+		{ 0x000000000000a18c, 0x2ee303af },
+		{ 0x000000000000a18c, 0x2ee26223 },
 	}, {
 		{ 0x0000d8db8f72935d, 0x00000005 },
 		{ 0x00000fbd5aed7a2e, 0x00000002 },
@@ -115,7 +139,10 @@ static const struct {
 		{ 0x00000000e16b20fa, 0x0002a14a },
 		{ 0x000000001e940d22, 0x00353b2e },
 		{ 0x0000000000ab40ac, 0x06fba6ba },
+		{ 0x00000000000f3f70, 0x0af7eeda },
 		{ 0x000000000008debd, 0x72d98365 },
+		{ 0x0000000000079fb8, 0x0b166dba },
+		{ 0x0000000000079fb8, 0x0b0ece02 },
 	}, {
 		{ 0x000cc3045b8fc281, 0x00000000 },
 		{ 0x0000ed1f48b5c9fc, 0x00000079 },
@@ -125,7 +152,10 @@ static const struct {
 		{ 0x0000000d43fce827, 0x00082b09 },
 		{ 0x00000001ccaba11a, 0x0037e8dd },
 		{ 0x000000000a13f729, 0x0566dffd },
+		{ 0x0000000000e5b64e, 0x3728203b },
 		{ 0x000000000085a14b, 0x23d36726 },
+		{ 0x000000000072db27, 0x38f38cd7 },
+		{ 0x000000000072db27, 0x3880b1b0 },
 	}, {
 		{ 0x00eafeb9c993592b, 0x00000001 },
 		{ 0x00110e5befa9a991, 0x00000048 },
@@ -135,7 +165,10 @@ static const struct {
 		{ 0x000000f4459740fc, 0x00084484 },
 		{ 0x0000002122c47bf9, 0x002ca446 },
 		{ 0x00000000b9936290, 0x004979c4 },
+		{ 0x000000001085e910, 0x05a83974 },
 		{ 0x00000000099ca89d, 0x9db446bf },
+		{ 0x000000000842f488, 0x26b40b94 },
+		{ 0x000000000842f488, 0x1e71170c },
 	}, {
 		{ 0x1b60cece589da1d2, 0x00000001 },
 		{ 0x01fcb42be1453f5b, 0x0000004f },
@@ -145,7 +178,49 @@ static const struct {
 		{ 0x00001c757dfab350, 0x00048863 },
 		{ 0x000003dc4979c652, 0x00224ea7 },
 		{ 0x000000159edc3144, 0x06409ab3 },
+		{ 0x00000001ecce8a7e, 0x30bc25e5 },
 		{ 0x000000011eadfee3, 0xa99c48a8 },
+		{ 0x00000000f6674543, 0x0a593ae9 },
+		{ 0x00000000f6674542, 0x13f1f5a5 },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000002 },
+		{ 0x0210842108421084, 0x0000000b },
+		{ 0x007f01fc07f01fc0, 0x000000fb },
+		{ 0x00014245eabf1f9a, 0x0000a63d },
+		{ 0x0000ffffffffffff, 0x0000fffb },
+		{ 0x00001d913cecc509, 0x0007937b },
+		{ 0x00000402c70c678f, 0x0005bfc9 },
+		{ 0x00000016766cb70b, 0x045edf97 },
+		{ 0x00000001fffffffb, 0x80000000 },
+		{ 0x0000000129d84b3a, 0xa2e8fe71 },
+		{ 0x0000000100000001, 0xfffffffd },
+		{ 0x0000000100000000, 0xfffffffb },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000003 },
+		{ 0x0210842108421084, 0x0000000c },
+		{ 0x007f01fc07f01fc0, 0x000000fc },
+		{ 0x00014245eabf1f9a, 0x0000a63e },
+		{ 0x0000ffffffffffff, 0x0000fffc },
+		{ 0x00001d913cecc509, 0x0007937c },
+		{ 0x00000402c70c678f, 0x0005bfca },
+		{ 0x00000016766cb70b, 0x045edf98 },
+		{ 0x00000001fffffffc, 0x00000000 },
+		{ 0x0000000129d84b3a, 0xa2e8fe72 },
+		{ 0x0000000100000002, 0x00000000 },
+		{ 0x0000000100000000, 0xfffffffc },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000006 },
+		{ 0x0210842108421084, 0x0000000f },
+		{ 0x007f01fc07f01fc0, 0x000000ff },
+		{ 0x00014245eabf1f9a, 0x0000a641 },
+		{ 0x0000ffffffffffff, 0x0000ffff },
+		{ 0x00001d913cecc509, 0x0007937f },
+		{ 0x00000402c70c678f, 0x0005bfcd },
+		{ 0x00000016766cb70b, 0x045edf9b },
+		{ 0x00000001fffffffc, 0x00000003 },
+		{ 0x0000000129d84b3a, 0xa2e8fe75 },
+		{ 0x0000000100000002, 0x00000003 },
+		{ 0x0000000100000001, 0x00000000 },
 	},
 };
 
@@ -208,6 +283,12 @@ static bool __init test_div64(void)
 			return false;
 		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_8, i, 8))
 			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_9, i, 9))
+			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_A, i, 10))
+			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_B, i, 11))
+			return false;
 		for (j = 0; j < SIZE_DIV64_DIVISORS; j++) {
 			if (!test_div64_one(dividend, test_div64_divisors[j],
 					    i, j))
-- 
2.46.1



* [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32()
  2024-10-03 21:16 [PATCH v4 0/4] simplify do_div() with constant divisor Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
@ 2024-10-03 21:16 ` Nicolas Pitre
  2024-10-04 22:47   ` kernel test robot
  2024-10-03 21:16 ` [PATCH v4 3/4] ARM: div64: improve __arch_xprod_64() Nicolas Pitre
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-03 21:16 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Several years later I just realized that this code could be greatly
simplified.

First, let's formalize the need for overflow handling in
__arch_xprod_64(). Assuming n = UINT64_MAX, there are 2 cases where
an overflow may occur:

1) If a bias must be added, we have m_lo * n_lo + m or
   m_lo * 0xffffffff + ((m_hi << 32) + m_lo) or
   ((m_lo << 32) - m_lo) + ((m_hi << 32) + m_lo) or
   (m_lo + m_hi) << 32, which must be < (1 << 64). So the criterion
   for no overflow is m_lo + m_hi < (1 << 32).

2) The cross product m_lo * n_hi + m_hi * n_lo becomes
   m_lo * 0xffffffff + m_hi * 0xffffffff or
   ((m_lo << 32) - m_lo) + ((m_hi << 32) - m_hi). Adding to this the
   top half of the result from the previous step, i.e. (m_lo + m_hi),
   we get (m_lo + m_hi) << 32 again, so the same criterion applies
   (spot-checked by the sketch below).
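
Both cases can be spot-checked with a small userspace program (an
illustration, not kernel code; check_no_ovf() is a made-up helper),
evaluating the sums in unsigned __int128 so that a would-be 64-bit
overflow stays visible:

```
#include <assert.h>
#include <stdint.h>

/* Worst case: n = UINT64_MAX, i.e. n_lo = n_hi = 0xffffffff. */
static void check_no_ovf(uint64_t m)
{
	const unsigned __int128 lim = (unsigned __int128)1 << 64;
	unsigned __int128 m_lo = (uint32_t)m, m_hi = m >> 32;
	unsigned __int128 nn = 0xffffffff;	 /* n_lo and n_hi */
	unsigned __int128 s1, s2;

	assert(m_lo + m_hi < 0x100000000ULL);	 /* the criterion */
	s1 = m_lo * nn + m;			 /* case 1 */
	s2 = m_lo * nn + m_hi * nn + (s1 >> 32); /* case 2 */
	assert(s1 < lim && s2 < lim);
}

int main(void)
{
	check_no_ovf(0x00000000ffffffffULL);	/* m_hi == 0 */
	check_no_ovf(0xffffffff00000000ULL);	/* m_lo == 0 */
	check_no_ovf(0x7fffffff80000000ULL);	/* m_lo + m_hi == 0xffffffff */
	return 0;
}
```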

So let's have a straight and simpler version for when this condition
holds. Otherwise, some reordering allows for taking care of possible
overflows without any actual conditionals. And to prevent the compiler
from generating both code variants, make sure this condition is
considered only when m is perceived as constant by the compiler.

This, in turn, allows for greatly simplifying __div64_const32(). The
"special case" may go away, as the regular case works just fine without
needing a bias. Then the reduction should be applied all the time, as
minimizing m is the key.
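
To make the whole method concrete, here is a condensed userspace model
of what __div64_const32() computes (a sketch under assumptions:
div_by_const() is a made-up name, unsigned __int128 stands in for the
128-bit product that the kernel builds from 32-bit pieces, and the m/p
reduction uses a simple loop where the macro divides both by the
lowest set bit of m):

```
#include <assert.h>
#include <stdint.h>

static uint64_t div_by_const(uint64_t n, uint32_t b)
{
	/* everything but the last line is compile-time work in the
	 * kernel macro; b must be non-zero */
	const unsigned __int128 two64 = (unsigned __int128)1 << 64;
	uint64_t p = 1ULL << (31 - __builtin_clz(b));	/* MSB of b */
	unsigned __int128 m = (two64 * p + b - 1) / b;	/* ceil(2^64*p/b) */
	uint64_t x = ~0ULL / b * b - 1;	/* worst-case test dividend */
	int bias = 0;

	if ((uint64_t)(m * x >> 64) / p != x / b) {
		/* rounding m up overshoots: round down and bias instead */
		m = two64 * p / b;
		bias = 1;
	}
	while (!(m & 1) && !(p & 1)) {	/* reduce m / p */
		m >>= 1;
		p >>= 1;
	}
	/* runtime part: ((bias ? m : 0) + m * n) >> 64, then / p */
	return (uint64_t)((m * n + (bias ? m : 0)) >> 64) / p;
}

int main(void)
{
	/* expected values from the lib/math/test_div64.c table in patch #1 */
	assert(div_by_const(0xf66745411d8ab063ULL, 0xdc08b349) ==
	       0x000000011eadfee3ULL);
	assert(div_by_const(~0ULL, 0xffffffff) == 0x100000001ULL);
	return 0;
}
```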

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 include/asm-generic/div64.h | 114 +++++++++++-------------------------
 1 file changed, 35 insertions(+), 79 deletions(-)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 13f5aa68a4..5d59cf7e73 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -74,7 +74,8 @@
 	 * do the trick here).						\
 	 */								\
 	uint64_t ___res, ___x, ___t, ___m, ___n = (n);			\
-	uint32_t ___p, ___bias;						\
+	uint32_t ___p;							\
+	bool ___bias = false;						\
 									\
 	/* determine MSB of b */					\
 	___p = 1 << ilog2(___b);					\
@@ -87,22 +88,14 @@
 	___x = ~0ULL / ___b * ___b - 1;					\
 									\
 	/* test our ___m with res = m * x / (p << 64) */		\
-	___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32;	\
-	___t = ___res += (___m & 0xffffffff) * (___x >> 32);		\
-	___res += (___x & 0xffffffff) * (___m >> 32);			\
-	___t = (___res < ___t) ? (1ULL << 32) : 0;			\
-	___res = (___res >> 32) + ___t;					\
-	___res += (___m >> 32) * (___x >> 32);				\
-	___res /= ___p;							\
+	___res = (___m & 0xffffffff) * (___x & 0xffffffff);		\
+	___t = (___m & 0xffffffff) * (___x >> 32) + (___res >> 32);	\
+	___res = (___m >> 32) * (___x >> 32) + (___t >> 32);		\
+	___t = (___m >> 32) * (___x & 0xffffffff) + (___t & 0xffffffff);\
+	___res = (___res + (___t >> 32)) / ___p;			\
 									\
-	/* Now sanitize and optimize what we've got. */			\
-	if (~0ULL % (___b / (___b & -___b)) == 0) {			\
-		/* special case, can be simplified to ... */		\
-		___n /= (___b & -___b);					\
-		___m = ~0ULL / (___b / (___b & -___b));			\
-		___p = 1;						\
-		___bias = 1;						\
-	} else if (___res != ___x / ___b) {				\
+	/* Now validate what we've got. */				\
+	if (___res != ___x / ___b) {					\
 		/*							\
 		 * We can't get away without a bias to compensate	\
 		 * for bit truncation errors.  To avoid it we'd need an	\
@@ -111,45 +104,18 @@
 		 *							\
 		 * Instead we do m = p / b and n / b = (n * m + m) / p.	\
 		 */							\
-		___bias = 1;						\
+		___bias = true;						\
 		/* Compute m = (p << 64) / b */				\
 		___m = (~0ULL / ___b) * ___p;				\
 		___m += ((~0ULL % ___b + 1) * ___p) / ___b;		\
-	} else {							\
-		/*							\
-		 * Reduce m / p, and try to clear bit 31 of m when	\
-		 * possible, otherwise that'll need extra overflow	\
-		 * handling later.					\
-		 */							\
-		uint32_t ___bits = -(___m & -___m);			\
-		___bits |= ___m >> 32;					\
-		___bits = (~___bits) << 1;				\
-		/*							\
-		 * If ___bits == 0 then setting bit 31 is  unavoidable.	\
-		 * Simply apply the maximum possible reduction in that	\
-		 * case. Otherwise the MSB of ___bits indicates the	\
-		 * best reduction we should apply.			\
-		 */							\
-		if (!___bits) {						\
-			___p /= (___m & -___m);				\
-			___m /= (___m & -___m);				\
-		} else {						\
-			___p >>= ilog2(___bits);			\
-			___m >>= ilog2(___bits);			\
-		}							\
-		/* No bias needed. */					\
-		___bias = 0;						\
 	}								\
 									\
+	/* Reduce m / p to help avoid overflow handling later. */	\
+	___p /= (___m & -___m);						\
+	___m /= (___m & -___m);						\
+									\
 	/*								\
-	 * Now we have a combination of 2 conditions:			\
-	 *								\
-	 * 1) whether or not we need to apply a bias, and		\
-	 *								\
-	 * 2) whether or not there might be an overflow in the cross	\
-	 *    product determined by (___m & ((1 << 63) | (1 << 31))).	\
-	 *								\
-	 * Select the best way to do (m_bias + m * n) / (1 << 64).	\
+	 * Perform (m_bias + m * n) / (1 << 64).			\
 	 * From now on there will be actual runtime code generated.	\
 	 */								\
 	___res = __arch_xprod_64(___m, ___n, ___bias);			\
@@ -165,7 +131,7 @@
  * Semantic:  retval = ((bias ? m : 0) + m * n) >> 64
  *
  * The product is a 128-bit value, scaled down to 64 bits.
- * Assuming constant propagation to optimize away unused conditional code.
+ * Hoping for compile-time optimization of conditional code.
  * Architectures may provide their own optimized assembly implementation.
  */
 static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
@@ -174,38 +140,28 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 	uint32_t m_hi = m >> 32;
 	uint32_t n_lo = n;
 	uint32_t n_hi = n >> 32;
-	uint64_t res;
-	uint32_t res_lo, res_hi, tmp;
-
-	if (!bias) {
-		res = ((uint64_t)m_lo * n_lo) >> 32;
-	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
-		/* there can't be any overflow here */
-		res = (m + (uint64_t)m_lo * n_lo) >> 32;
-	} else {
-		res = m + (uint64_t)m_lo * n_lo;
-		res_lo = res >> 32;
-		res_hi = (res_lo < m_hi);
-		res = res_lo | ((uint64_t)res_hi << 32);
-	}
-
-	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
-		/* there can't be any overflow here */
-		res += (uint64_t)m_lo * n_hi;
-		res += (uint64_t)m_hi * n_lo;
-		res >>= 32;
+	uint64_t x, y;
+
+	/* Determine if overflow handling can be dispensed with. */
+	bool no_ovf = __builtin_constant_p(m) &&
+		      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
+
+	if (no_ovf) {
+		x = (uint64_t)m_lo * n_lo + (bias ? m : 0);
+		x >>= 32;
+		x += (uint64_t)m_lo * n_hi;
+		x += (uint64_t)m_hi * n_lo;
+		x >>= 32;
+		x += (uint64_t)m_hi * n_hi;
 	} else {
-		res += (uint64_t)m_lo * n_hi;
-		tmp = res >> 32;
-		res += (uint64_t)m_hi * n_lo;
-		res_lo = res >> 32;
-		res_hi = (res_lo < tmp);
-		res = res_lo | ((uint64_t)res_hi << 32);
+		x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0);
+		y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0);
+		x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
+		y = (uint64_t)m_hi * n_lo + (uint32_t)y;
+		x += (uint32_t)(y >> 32);
 	}
 
-	res += (uint64_t)m_hi * n_hi;
-
-	return res;
+	return x;
 }
 #endif
 
-- 
2.46.1



* [PATCH v4 3/4] ARM: div64: improve __arch_xprod_64()
  2024-10-03 21:16 [PATCH v4 0/4] simplify do_div() with constant divisor Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32() Nicolas Pitre
@ 2024-10-03 21:16 ` Nicolas Pitre
  2024-10-03 21:16 ` [PATCH v4 4/4] __arch_xprod_64(): make __always_inline when optimizing for performance Nicolas Pitre
  2024-10-04 13:25 ` [PATCH v4 0/4] simplify do_div() with constant divisor Arnd Bergmann
  4 siblings, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-03 21:16 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Let's use the same criterion as the generic code for deciding when
overflow handling is necessary.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 arch/arm/include/asm/div64.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 4b69cf8504..562d5376ae 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -56,6 +56,8 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 {
 	unsigned long long res;
 	register unsigned int tmp asm("ip") = 0;
+	bool no_ovf = __builtin_constant_p(m) &&
+		      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
 
 	if (!bias) {
 		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
@@ -63,7 +65,7 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 			: "=&r" (res)
 			: "r" (m), "r" (n)
 			: "cc");
-	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+	} else if (no_ovf) {
 		res = m;
 		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
 			"mov	%Q0, #0"
@@ -80,7 +82,7 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 			: "cc");
 	}
 
-	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+	if (no_ovf) {
 		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
 			"umlal	%R0, %Q0, %Q1, %R2\n\t"
 			"mov	%R0, #0\n\t"
-- 
2.46.1



* [PATCH v4 4/4] __arch_xprod_64(): make __always_inline when optimizing for performance
  2024-10-03 21:16 [PATCH v4 0/4] simplify do_div() with constant divisor Nicolas Pitre
                   ` (2 preceding siblings ...)
  2024-10-03 21:16 ` [PATCH v4 3/4] ARM: div64: improve __arch_xprod_64() Nicolas Pitre
@ 2024-10-03 21:16 ` Nicolas Pitre
  2024-10-04 13:25 ` [PATCH v4 0/4] simplify do_div() with constant divisor Arnd Bergmann
  4 siblings, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-03 21:16 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Recent gcc versions have started not systematically inlining
__arch_xprod_64(), and that has performance implications. Give the
compiler the freedom to decide only when optimizing for size.

Here are some timing numbers from lib/math/test_div64.c:

Using __always_inline:

```
test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.048285584s elapsed
```

Without __always_inline:

```
test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.053023584s elapsed
```

Forcing constant base through the non-constant base code path:

```
test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.103263776s elapsed
```

It is worth noting that test_div64 already performs half of its tests
with non-constant divisors, so the impact is greater than what those
numbers show. And for what it is worth, those numbers were obtained
using QEMU. The gcc version is 14.1.0.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 arch/arm/include/asm/div64.h | 7 ++++++-
 include/asm-generic/div64.h  | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 562d5376ae..d3ef8e416b 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -52,7 +52,12 @@ static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
 
 #else
 
-static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
+static __always_inline
+#else
+static inline
+#endif
+uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 {
 	unsigned long long res;
 	register unsigned int tmp asm("ip") = 0;
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 5d59cf7e73..25e7b4b58d 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -134,7 +134,12 @@
 * Hoping for compile-time optimization of conditional code.
  * Architectures may provide their own optimized assembly implementation.
  */
-static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
+static __always_inline
+#else
+static inline
+#endif
+uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 {
 	uint32_t m_lo = m;
 	uint32_t m_hi = m >> 32;
-- 
2.46.1



* Re: [PATCH v4 0/4] simplify do_div() with constant divisor
  2024-10-03 21:16 [PATCH v4 0/4] simplify do_div() with constant divisor Nicolas Pitre
                   ` (3 preceding siblings ...)
  2024-10-03 21:16 ` [PATCH v4 4/4] __arch_xprod_64(): make __always_inline when optimizing for performance Nicolas Pitre
@ 2024-10-04 13:25 ` Arnd Bergmann
  2024-10-26  0:36   ` Nicolas Pitre
  4 siblings, 1 reply; 9+ messages in thread
From: Arnd Bergmann @ 2024-10-04 13:25 UTC (permalink / raw)
  To: Nicolas Pitre, Russell King; +Cc: Nicolas Pitre, Linux-Arch, linux-kernel

On Thu, Oct 3, 2024, at 21:16, Nicolas Pitre wrote:
> While working on mul_u64_u64_div_u64() improvements I realized that there
> is a better way to perform a 64x64->128 bits multiplication with overflow
> handling.
>
> Change from v3:
>
> - Added timings to commit log of patch #4.
>
> Link to v3: 
> https://lore.kernel.org/lkml/20240708012749.2098373-2-nico@fluxnic.net/T/

Looks good to me. If there are no other comments, I would apply this
in the asm-generic tree for 6.13.

       Arnd


* Re: [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32()
  2024-10-03 21:16 ` [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32() Nicolas Pitre
@ 2024-10-04 22:47   ` kernel test robot
  0 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2024-10-04 22:47 UTC (permalink / raw)
  To: Nicolas Pitre, Arnd Bergmann, Russell King
  Cc: oe-kbuild-all, Nicolas Pitre, linux-arch, linux-kernel

Hi Nicolas,

kernel test robot noticed the following build warnings:

[auto build test WARNING on arnd-asm-generic/master]
[also build test WARNING on soc/for-next linus/master arm/for-next arm/fixes v6.12-rc1 next-20241004]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nicolas-Pitre/lib-math-test_div64-add-some-edge-cases-relevant-to-__div64_const32/20241004-052015
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
patch link:    https://lore.kernel.org/r/20241003211829.2750436-3-nico%40fluxnic.net
patch subject: [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32()
config: arm-u8500_defconfig (https://download.01.org/0day-ci/archive/20241005/202410050606.G2mm4qbj-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241005/202410050606.G2mm4qbj-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410050606.G2mm4qbj-lkp@intel.com/

All warnings (new ones prefixed by >>):

   security/keys/proc.c: In function 'proc_keys_show':
   security/keys/proc.c:217:45: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=]
     217 |                         sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7));
         |                                             ^
   security/keys/proc.c:217:25: note: 'sprintf' output between 3 and 17 bytes into a destination of size 16
     217 |                         sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7));
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   security/keys/proc.c:215:45: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=]
     215 |                         sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24));
         |                                             ^
   security/keys/proc.c:215:25: note: 'sprintf' output between 3 and 17 bytes into a destination of size 16
     215 |                         sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24));
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   security/keys/proc.c:213:44: warning: 'h' directive writing 1 byte into a region of size between 0 and 15 [-Wformat-overflow=]
     213 |                         sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60));
         |                                            ^
   security/keys/proc.c:213:25: note: 'sprintf' output between 3 and 18 bytes into a destination of size 16
     213 |                         sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60));
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> security/keys/proc.c:211:40: warning: '%llu' directive writing between 1 and 18 bytes into a region of size 16 [-Wformat-overflow=]
     211 |                         sprintf(xbuf, "%llum", div_u64(timo, 60));
         |                                        ^~~~
   security/keys/proc.c:211:39: note: directive argument in the range [0, 576460752303423487]
     211 |                         sprintf(xbuf, "%llum", div_u64(timo, 60));
         |                                       ^~~~~~~
   security/keys/proc.c:211:25: note: 'sprintf' output between 3 and 20 bytes into a destination of size 16
     211 |                         sprintf(xbuf, "%llum", div_u64(timo, 60));
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +211 security/keys/proc.c

^1da177e4c3f41 Linus Torvalds 2005-04-16  152  
^1da177e4c3f41 Linus Torvalds 2005-04-16  153  static int proc_keys_show(struct seq_file *m, void *v)
^1da177e4c3f41 Linus Torvalds 2005-04-16  154  {
^1da177e4c3f41 Linus Torvalds 2005-04-16  155  	struct rb_node *_p = v;
^1da177e4c3f41 Linus Torvalds 2005-04-16  156  	struct key *key = rb_entry(_p, struct key, serial_node);
ab5c69f01313c8 Eric Biggers   2017-09-27  157  	unsigned long flags;
927942aabbbe50 David Howells  2010-06-11  158  	key_ref_t key_ref, skey_ref;
074d58989569b3 Baolin Wang    2017-11-15  159  	time64_t now, expiry;
03dab869b7b239 David Howells  2016-10-26  160  	char xbuf[16];
363b02dab09b32 David Howells  2017-10-04  161  	short state;
074d58989569b3 Baolin Wang    2017-11-15  162  	u64 timo;
06ec7be557a125 Michael LeMay  2006-06-26  163  	int rc;
06ec7be557a125 Michael LeMay  2006-06-26  164  
4bdf0bc3003141 David Howells  2013-09-24  165  	struct keyring_search_context ctx = {
ede0fa98a900e6 Eric Biggers   2019-02-22  166  		.index_key		= key->index_key,
4aa68e07d84556 Eric Biggers   2017-09-18  167  		.cred			= m->file->f_cred,
462919591a1791 David Howells  2014-09-16  168  		.match_data.cmp		= lookup_user_key_possessed,
462919591a1791 David Howells  2014-09-16  169  		.match_data.raw_data	= key,
462919591a1791 David Howells  2014-09-16  170  		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
dcf49dbc8077e2 David Howells  2019-06-26  171  		.flags			= (KEYRING_SEARCH_NO_STATE_CHECK |
dcf49dbc8077e2 David Howells  2019-06-26  172  					   KEYRING_SEARCH_RECURSE),
4bdf0bc3003141 David Howells  2013-09-24  173  	};
4bdf0bc3003141 David Howells  2013-09-24  174  
028db3e290f15a Linus Torvalds 2019-07-10  175  	key_ref = make_key_ref(key, 0);
927942aabbbe50 David Howells  2010-06-11  176  
927942aabbbe50 David Howells  2010-06-11  177  	/* determine if the key is possessed by this process (a test we can
927942aabbbe50 David Howells  2010-06-11  178  	 * skip if the key does not indicate the possessor can view it
927942aabbbe50 David Howells  2010-06-11  179  	 */
028db3e290f15a Linus Torvalds 2019-07-10  180  	if (key->perm & KEY_POS_VIEW) {
028db3e290f15a Linus Torvalds 2019-07-10  181  		rcu_read_lock();
e59428f721ee09 David Howells  2019-06-19  182  		skey_ref = search_cred_keyrings_rcu(&ctx);
028db3e290f15a Linus Torvalds 2019-07-10  183  		rcu_read_unlock();
927942aabbbe50 David Howells  2010-06-11  184  		if (!IS_ERR(skey_ref)) {
927942aabbbe50 David Howells  2010-06-11  185  			key_ref_put(skey_ref);
927942aabbbe50 David Howells  2010-06-11  186  			key_ref = make_key_ref(key, 1);
927942aabbbe50 David Howells  2010-06-11  187  		}
927942aabbbe50 David Howells  2010-06-11  188  	}
927942aabbbe50 David Howells  2010-06-11  189  
4aa68e07d84556 Eric Biggers   2017-09-18  190  	/* check whether the current task is allowed to view the key */
f5895943d91b41 David Howells  2014-03-14  191  	rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW);
06ec7be557a125 Michael LeMay  2006-06-26  192  	if (rc < 0)
028db3e290f15a Linus Torvalds 2019-07-10  193  		return 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16  194  
074d58989569b3 Baolin Wang    2017-11-15  195  	now = ktime_get_real_seconds();
^1da177e4c3f41 Linus Torvalds 2005-04-16  196  
028db3e290f15a Linus Torvalds 2019-07-10  197  	rcu_read_lock();
028db3e290f15a Linus Torvalds 2019-07-10  198  
^1da177e4c3f41 Linus Torvalds 2005-04-16  199  	/* come up with a suitable timeout value */
ab5c69f01313c8 Eric Biggers   2017-09-27  200  	expiry = READ_ONCE(key->expiry);
39299bdd254668 David Howells  2023-12-09  201  	if (expiry == TIME64_MAX) {
^1da177e4c3f41 Linus Torvalds 2005-04-16  202  		memcpy(xbuf, "perm", 5);
074d58989569b3 Baolin Wang    2017-11-15  203  	} else if (now >= expiry) {
^1da177e4c3f41 Linus Torvalds 2005-04-16  204  		memcpy(xbuf, "expd", 5);
7b1b9164598286 David Howells  2009-09-02  205  	} else {
074d58989569b3 Baolin Wang    2017-11-15  206  		timo = expiry - now;
^1da177e4c3f41 Linus Torvalds 2005-04-16  207  
^1da177e4c3f41 Linus Torvalds 2005-04-16  208  		if (timo < 60)
074d58989569b3 Baolin Wang    2017-11-15  209  			sprintf(xbuf, "%llus", timo);
^1da177e4c3f41 Linus Torvalds 2005-04-16  210  		else if (timo < 60*60)
074d58989569b3 Baolin Wang    2017-11-15 @211  			sprintf(xbuf, "%llum", div_u64(timo, 60));
^1da177e4c3f41 Linus Torvalds 2005-04-16  212  		else if (timo < 60*60*24)
074d58989569b3 Baolin Wang    2017-11-15  213  			sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60));
^1da177e4c3f41 Linus Torvalds 2005-04-16  214  		else if (timo < 60*60*24*7)
074d58989569b3 Baolin Wang    2017-11-15 @215  			sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24));
^1da177e4c3f41 Linus Torvalds 2005-04-16  216  		else
074d58989569b3 Baolin Wang    2017-11-15  217  			sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7));
^1da177e4c3f41 Linus Torvalds 2005-04-16  218  	}
^1da177e4c3f41 Linus Torvalds 2005-04-16  219  
363b02dab09b32 David Howells  2017-10-04  220  	state = key_read_state(key);
363b02dab09b32 David Howells  2017-10-04  221  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* Re: [PATCH v4 0/4] simplify do_div() with constant divisor
  2024-10-04 13:25 ` [PATCH v4 0/4] simplify do_div() with constant divisor Arnd Bergmann
@ 2024-10-26  0:36   ` Nicolas Pitre
  2024-10-26 10:04     ` Arnd Bergmann
  0 siblings, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2024-10-26  0:36 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Russell King, Linux-Arch, linux-kernel

On Fri, 4 Oct 2024, Arnd Bergmann wrote:

> On Thu, Oct 3, 2024, at 21:16, Nicolas Pitre wrote:
> > While working on mul_u64_u64_div_u64() improvements I realized that there
> > is a better way to perform a 64x64->128 bits multiplication with overflow
> > handling.
> >
> > Change from v3:
> >
> > - Added timings to commit log of patch #4.
> >
> > Link to v3: 
> > https://lore.kernel.org/lkml/20240708012749.2098373-2-nico@fluxnic.net/T/
> 
> Looks good to me. If there are no other comments, I would apply this
> in the asm-generic tree for 6.13.

Hello Arnd, there is still no sign of those patches in either the
asm-generic tree or the linux-next tree.


Nicolas


* Re: [PATCH v4 0/4] simplify do_div() with constant divisor
  2024-10-26  0:36   ` Nicolas Pitre
@ 2024-10-26 10:04     ` Arnd Bergmann
  0 siblings, 0 replies; 9+ messages in thread
From: Arnd Bergmann @ 2024-10-26 10:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Russell King, Linux-Arch, linux-kernel

On Sat, Oct 26, 2024, at 00:36, Nicolas Pitre wrote:
> On Fri, 4 Oct 2024, Arnd Bergmann wrote:
>
>> On Thu, Oct 3, 2024, at 21:16, Nicolas Pitre wrote:
>> > While working on mul_u64_u64_div_u64() improvements I realized that there
>> > is a better way to perform a 64x64->128 bits multiplication with overflow
>> > handling.
>> >
>> > Change from v3:
>> >
>> > - Added timings to commit log of patch #4.
>> >
>> > Link to v3: 
>> > https://lore.kernel.org/lkml/20240708012749.2098373-2-nico@fluxnic.net/T/
>> 
>> Looks good to me. If there are no other comments, I would apply this
>> in the asm-generic tree for 6.13.
>
> Hello Arnd, still no sign of those patches in the asm-generic tree nor 
> the linux-next tree.

Thanks for the reminder, I've applied it now.

     Arnd

