public inbox for linux-kernel@vger.kernel.org
* [PATCH v2 0/4] do_div() with constant divisor simplification
@ 2024-07-07 17:17 Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 17:17 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

While working on mul_u64_u64_div_u64() improvements I realized that there
is a better way to perform a 64x64->128 bit multiplication with overflow
handling. This is not as lean as v1 of the series, but it is still much
better than the existing code IMHO.
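
For those unfamiliar with the trick, here is a minimal userspace model of
the overflow-free ordering the generic code ends up with in patch 2
(illustrative names, bias handling omitted):

#include <stdint.h>

/*
 * Return the high 64 bits of m * n. The partial products are
 * composed in an order whose intermediate sums cannot overflow
 * 64 bits, so no explicit carry handling is needed.
 */
static uint64_t mul64_hi(uint64_t m, uint64_t n)
{
	uint32_t m_lo = m, m_hi = m >> 32;
	uint32_t n_lo = n, n_hi = n >> 32;
	uint64_t x, y;

	x = (uint64_t)m_lo * n_lo;
	y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32);
	x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
	y = (uint64_t)m_hi * n_lo + (uint32_t)y;
	return x + (uint32_t)(y >> 32);
}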

Changes from v1:

- Formalize the condition for when overflow handling can be skipped.
- Make this condition apply only if it can be determined at compile time
  (beware of the compiler not always inlining code).
- Keep the ARM assembly but apply the above changes to it as well.
- Simplify generic C code for __arch_xprod64() further.
- Force __always_inline when optimizing for performance.
- Augment test_div64.c with important edge cases.

Link to v1: https://lore.kernel.org/lkml/20240705022334.1378363-1-nico@fluxnic.net/

The diffstat is:

 arch/arm/include/asm/div64.h |  13 +++-
 include/asm-generic/div64.h  | 121 ++++++++++++-----------------------
 lib/math/test_div64.c        |  85 +++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 85 deletions(-)

* [PATCH v2 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32()
  2024-07-07 17:17 [PATCH v2 0/4] do_div() with constant divisor simplification Nicolas Pitre
@ 2024-07-07 17:17 ` Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 2/4] asm-generic/div64: optimize/simplify __div64_const32() Nicolas Pitre
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 17:17 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Be sure to test the extreme cases with and without bias.
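
For reference, the expected quotient/remainder pairs can be cross-checked
with native 64-bit division in userspace. A trivial sketch (not part of
the patch) reproducing the last entry of the new table rows:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t n = 0xffffffffffffffffULL;	/* largest dividend */
	uint32_t b = 0xffffffff;		/* largest divisor */

	/* prints "0x100000001 rem 0" */
	printf("%#llx rem %#x\n",
	       (unsigned long long)(n / b), (uint32_t)(n % b));
	return 0;
}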

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 lib/math/test_div64.c | 85 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/lib/math/test_div64.c b/lib/math/test_div64.c
index c15edd688d..3cd699b654 100644
--- a/lib/math/test_div64.c
+++ b/lib/math/test_div64.c
@@ -26,6 +26,9 @@ static const u64 test_div64_dividends[] = {
 	0x0072db27380dd689,
 	0x0842f488162e2284,
 	0xf66745411d8ab063,
+	0xfffffffffffffffb,
+	0xfffffffffffffffc,
+	0xffffffffffffffff,
 };
 #define SIZE_DIV64_DIVIDENDS ARRAY_SIZE(test_div64_dividends)
 
@@ -37,7 +40,10 @@ static const u64 test_div64_dividends[] = {
 #define TEST_DIV64_DIVISOR_5 0x0008a880
 #define TEST_DIV64_DIVISOR_6 0x003fd3ae
 #define TEST_DIV64_DIVISOR_7 0x0b658fac
-#define TEST_DIV64_DIVISOR_8 0xdc08b349
+#define TEST_DIV64_DIVISOR_8 0x80000001
+#define TEST_DIV64_DIVISOR_9 0xdc08b349
+#define TEST_DIV64_DIVISOR_A 0xfffffffe
+#define TEST_DIV64_DIVISOR_B 0xffffffff
 
 static const u32 test_div64_divisors[] = {
 	TEST_DIV64_DIVISOR_0,
@@ -49,13 +55,16 @@ static const u32 test_div64_divisors[] = {
 	TEST_DIV64_DIVISOR_6,
 	TEST_DIV64_DIVISOR_7,
 	TEST_DIV64_DIVISOR_8,
+	TEST_DIV64_DIVISOR_9,
+	TEST_DIV64_DIVISOR_A,
+	TEST_DIV64_DIVISOR_B,
 };
 #define SIZE_DIV64_DIVISORS ARRAY_SIZE(test_div64_divisors)
 
 static const struct {
 	u64 quotient;
 	u32 remainder;
-} test_div64_results[SIZE_DIV64_DIVISORS][SIZE_DIV64_DIVIDENDS] = {
+} test_div64_results[SIZE_DIV64_DIVIDENDS][SIZE_DIV64_DIVISORS] = {
 	{
 		{ 0x0000000013045e47, 0x00000001 },
 		{ 0x000000000161596c, 0x00000030 },
@@ -65,6 +74,9 @@ static const struct {
 		{ 0x00000000000013c4, 0x0004ce80 },
 		{ 0x00000000000002ae, 0x001e143c },
 		{ 0x000000000000000f, 0x0033e56c },
+		{ 0x0000000000000001, 0x2b27507f },
+		{ 0x0000000000000000, 0xab275080 },
+		{ 0x0000000000000000, 0xab275080 },
 		{ 0x0000000000000000, 0xab275080 },
 	}, {
 		{ 0x00000001c45c02d1, 0x00000000 },
@@ -75,7 +87,10 @@ static const struct {
 		{ 0x000000000001d637, 0x0004e5d9 },
 		{ 0x0000000000003fc9, 0x000713bb },
 		{ 0x0000000000000165, 0x029abe7d },
+		{ 0x000000000000001f, 0x673c193a },
 		{ 0x0000000000000012, 0x6e9f7e37 },
+		{ 0x000000000000000f, 0xe73c1977 },
+		{ 0x000000000000000f, 0xe73c1968 },
 	}, {
 		{ 0x000000197a3a0cf7, 0x00000002 },
 		{ 0x00000001d9632e5c, 0x00000021 },
@@ -85,7 +100,10 @@ static const struct {
 		{ 0x00000000001a7bb3, 0x00072331 },
 		{ 0x00000000000397ad, 0x0002c61b },
 		{ 0x000000000000141e, 0x06ea2e89 },
+		{ 0x00000000000001ca, 0x4c0a72e7 },
 		{ 0x000000000000010a, 0xab002ad7 },
+		{ 0x00000000000000e5, 0x4c0a767b },
+		{ 0x00000000000000e5, 0x4c0a7596 },
 	}, {
 		{ 0x0000017949e37538, 0x00000001 },
 		{ 0x0000001b62441f37, 0x00000055 },
@@ -95,7 +113,10 @@ static const struct {
 		{ 0x0000000001882ec6, 0x0005cbf9 },
 		{ 0x000000000035333b, 0x0017abdf },
 		{ 0x00000000000129f1, 0x0ab4520d },
+		{ 0x0000000000001a87, 0x18ff0472 },
 		{ 0x0000000000000f6e, 0x8ac0ce9b },
+		{ 0x0000000000000d43, 0x98ff397f },
+		{ 0x0000000000000d43, 0x98ff2c3c },
 	}, {
 		{ 0x000011f321a74e49, 0x00000006 },
 		{ 0x0000014d8481d211, 0x0000005b },
@@ -105,7 +126,10 @@ static const struct {
 		{ 0x0000000012a88828, 0x00036c97 },
 		{ 0x000000000287f16f, 0x002c2a25 },
 		{ 0x00000000000e2cc7, 0x02d581e3 },
+		{ 0x0000000000014318, 0x2ee07d7f },
 		{ 0x000000000000bbf4, 0x1ba08c03 },
+		{ 0x000000000000a18c, 0x2ee303af },
+		{ 0x000000000000a18c, 0x2ee26223 },
 	}, {
 		{ 0x0000d8db8f72935d, 0x00000005 },
 		{ 0x00000fbd5aed7a2e, 0x00000002 },
@@ -115,7 +139,10 @@ static const struct {
 		{ 0x00000000e16b20fa, 0x0002a14a },
 		{ 0x000000001e940d22, 0x00353b2e },
 		{ 0x0000000000ab40ac, 0x06fba6ba },
+		{ 0x00000000000f3f70, 0x0af7eeda },
 		{ 0x000000000008debd, 0x72d98365 },
+		{ 0x0000000000079fb8, 0x0b166dba },
+		{ 0x0000000000079fb8, 0x0b0ece02 },
 	}, {
 		{ 0x000cc3045b8fc281, 0x00000000 },
 		{ 0x0000ed1f48b5c9fc, 0x00000079 },
@@ -125,7 +152,10 @@ static const struct {
 		{ 0x0000000d43fce827, 0x00082b09 },
 		{ 0x00000001ccaba11a, 0x0037e8dd },
 		{ 0x000000000a13f729, 0x0566dffd },
+		{ 0x0000000000e5b64e, 0x3728203b },
 		{ 0x000000000085a14b, 0x23d36726 },
+		{ 0x000000000072db27, 0x38f38cd7 },
+		{ 0x000000000072db27, 0x3880b1b0 },
 	}, {
 		{ 0x00eafeb9c993592b, 0x00000001 },
 		{ 0x00110e5befa9a991, 0x00000048 },
@@ -135,7 +165,10 @@ static const struct {
 		{ 0x000000f4459740fc, 0x00084484 },
 		{ 0x0000002122c47bf9, 0x002ca446 },
 		{ 0x00000000b9936290, 0x004979c4 },
+		{ 0x000000001085e910, 0x05a83974 },
 		{ 0x00000000099ca89d, 0x9db446bf },
+		{ 0x000000000842f488, 0x26b40b94 },
+		{ 0x000000000842f488, 0x1e71170c },
 	}, {
 		{ 0x1b60cece589da1d2, 0x00000001 },
 		{ 0x01fcb42be1453f5b, 0x0000004f },
@@ -145,7 +178,49 @@ static const struct {
 		{ 0x00001c757dfab350, 0x00048863 },
 		{ 0x000003dc4979c652, 0x00224ea7 },
 		{ 0x000000159edc3144, 0x06409ab3 },
+		{ 0x00000001ecce8a7e, 0x30bc25e5 },
 		{ 0x000000011eadfee3, 0xa99c48a8 },
+		{ 0x00000000f6674543, 0x0a593ae9 },
+		{ 0x00000000f6674542, 0x13f1f5a5 },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000002 },
+		{ 0x0210842108421084, 0x0000000b },
+		{ 0x007f01fc07f01fc0, 0x000000fb },
+		{ 0x00014245eabf1f9a, 0x0000a63d },
+		{ 0x0000ffffffffffff, 0x0000fffb },
+		{ 0x00001d913cecc509, 0x0007937b },
+		{ 0x00000402c70c678f, 0x0005bfc9 },
+		{ 0x00000016766cb70b, 0x045edf97 },
+		{ 0x00000001fffffffb, 0x80000000 },
+		{ 0x0000000129d84b3a, 0xa2e8fe71 },
+		{ 0x0000000100000001, 0xfffffffd },
+		{ 0x0000000100000000, 0xfffffffb },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000003 },
+		{ 0x0210842108421084, 0x0000000c },
+		{ 0x007f01fc07f01fc0, 0x000000fc },
+		{ 0x00014245eabf1f9a, 0x0000a63e },
+		{ 0x0000ffffffffffff, 0x0000fffc },
+		{ 0x00001d913cecc509, 0x0007937c },
+		{ 0x00000402c70c678f, 0x0005bfca },
+		{ 0x00000016766cb70b, 0x045edf98 },
+		{ 0x00000001fffffffc, 0x00000000 },
+		{ 0x0000000129d84b3a, 0xa2e8fe72 },
+		{ 0x0000000100000002, 0x00000000 },
+		{ 0x0000000100000000, 0xfffffffc },
+	}, {
+		{ 0x1c71c71c71c71c71, 0x00000006 },
+		{ 0x0210842108421084, 0x0000000f },
+		{ 0x007f01fc07f01fc0, 0x000000ff },
+		{ 0x00014245eabf1f9a, 0x0000a641 },
+		{ 0x0000ffffffffffff, 0x0000ffff },
+		{ 0x00001d913cecc509, 0x0007937f },
+		{ 0x00000402c70c678f, 0x0005bfcd },
+		{ 0x00000016766cb70b, 0x045edf9b },
+		{ 0x00000001fffffffc, 0x00000003 },
+		{ 0x0000000129d84b3a, 0xa2e8fe75 },
+		{ 0x0000000100000002, 0x00000003 },
+		{ 0x0000000100000001, 0x00000000 },
 	},
 };
 
@@ -208,6 +283,12 @@ static bool __init test_div64(void)
 			return false;
 		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_8, i, 8))
 			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_9, i, 9))
+			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_A, i, 10))
+			return false;
+		if (!test_div64_one(dividend, TEST_DIV64_DIVISOR_B, i, 11))
+			return false;
 		for (j = 0; j < SIZE_DIV64_DIVISORS; j++) {
 			if (!test_div64_one(dividend, test_div64_divisors[j],
 					    i, j))
-- 
2.45.2


* [PATCH v2 2/4] asm-generic/div64: optimize/simplify __div64_const32()
  2024-07-07 17:17 [PATCH v2 0/4] do_div() with constant divisor simplification Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
@ 2024-07-07 17:17 ` Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 3/4] ARM: div64: improve __arch_xprod_64() Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance Nicolas Pitre
  3 siblings, 0 replies; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 17:17 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Several years later I just realized that this code could be greatly
simplified.

First, let's formalize the need for overflow handling in
__arch_xprod_64(). Assuming n = UINT64_MAX, there are 2 cases where
an overflow may occur:

1) If a bias must be added, we have m_lo * n_lo + m, i.e.
   m_lo * 0xffffffff + ((m_hi << 32) + m_lo), i.e.
   ((m_lo << 32) - m_lo) + ((m_hi << 32) + m_lo), i.e.
   (m_lo + m_hi) << 32, which must be < (1 << 64). So the criterion for
   no overflow is m_lo + m_hi < (1 << 32).

2) The cross product m_lo * n_hi + m_hi * n_lo, i.e.
   m_lo * 0xffffffff + m_hi * 0xffffffff, i.e.
   ((m_lo << 32) - m_lo) + ((m_hi << 32) - m_hi). Adding to this the
   top half of the previous step's result, i.e. (m_lo + m_hi), we get
   (m_lo + m_hi) << 32 again, so the same criterion applies (see the
   check below).
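
Both cases can be double checked in userspace with the compiler's
128-bit type, which the kernel code deliberately avoids. Illustrative
only:

#include <assert.h>
#include <stdint.h>

/*
 * With n = UINT64_MAX, both sums above reduce to (m_lo + m_hi) << 32,
 * which fits in 64 bits whenever m_lo + m_hi < (1 << 32).
 */
static void check_criterion(uint64_t m)
{
	uint32_t m_lo = m, m_hi = m >> 32;

	if ((uint64_t)m_lo + m_hi < 0x100000000ULL) {
		unsigned __int128 t1, t2;

		/* case 1: biased low product */
		t1 = (unsigned __int128)m_lo * 0xffffffffU + m;
		/* case 2: cross product plus the carried top half */
		t2 = (unsigned __int128)m_lo * 0xffffffffU +
		     (unsigned __int128)m_hi * 0xffffffffU +
		     (uint64_t)(t1 >> 32);
		assert(t1 < ((unsigned __int128)1 << 64));
		assert(t2 < ((unsigned __int128)1 << 64));
	}
}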

So let's have a straight and simpler version for when this condition
holds. Otherwise, some reordering allows for taking care of possible
overflows without any actual conditionals. And to avoid generating both
code variants, make sure this is considered only if m is perceived as
constant by the compiler.

This, in turn, allows for greatly simplifying __div64_const32(). The
"special case" may go away, as the regular case works just fine without
needing a bias. Then the reduction should be applied all the time, as
minimizing m is the key.
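
Note in passing that the reduction relies on the two's-complement idiom
m & -m which isolates the lowest set bit, e.g.:

	uint64_t m = 0xba0;		/* binary 1011 1010 0000 */
	uint64_t lsb = m & -m;		/* = 0x20, the lowest set bit */

Dividing both m and p by that value strips their common power-of-two
factor in one step.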

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 include/asm-generic/div64.h | 114 +++++++++++-------------------------
 1 file changed, 35 insertions(+), 79 deletions(-)

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 13f5aa68a4..5d59cf7e73 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -74,7 +74,8 @@
 	 * do the trick here).						\
 	 */								\
 	uint64_t ___res, ___x, ___t, ___m, ___n = (n);			\
-	uint32_t ___p, ___bias;						\
+	uint32_t ___p;							\
+	bool ___bias = false;						\
 									\
 	/* determine MSB of b */					\
 	___p = 1 << ilog2(___b);					\
@@ -87,22 +88,14 @@
 	___x = ~0ULL / ___b * ___b - 1;					\
 									\
 	/* test our ___m with res = m * x / (p << 64) */		\
-	___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32;	\
-	___t = ___res += (___m & 0xffffffff) * (___x >> 32);		\
-	___res += (___x & 0xffffffff) * (___m >> 32);			\
-	___t = (___res < ___t) ? (1ULL << 32) : 0;			\
-	___res = (___res >> 32) + ___t;					\
-	___res += (___m >> 32) * (___x >> 32);				\
-	___res /= ___p;							\
+	___res = (___m & 0xffffffff) * (___x & 0xffffffff);		\
+	___t = (___m & 0xffffffff) * (___x >> 32) + (___res >> 32);	\
+	___res = (___m >> 32) * (___x >> 32) + (___t >> 32);		\
+	___t = (___m >> 32) * (___x & 0xffffffff) + (___t & 0xffffffff);\
+	___res = (___res + (___t >> 32)) / ___p;			\
 									\
-	/* Now sanitize and optimize what we've got. */			\
-	if (~0ULL % (___b / (___b & -___b)) == 0) {			\
-		/* special case, can be simplified to ... */		\
-		___n /= (___b & -___b);					\
-		___m = ~0ULL / (___b / (___b & -___b));			\
-		___p = 1;						\
-		___bias = 1;						\
-	} else if (___res != ___x / ___b) {				\
+	/* Now validate what we've got. */				\
+	if (___res != ___x / ___b) {					\
 		/*							\
 		 * We can't get away without a bias to compensate	\
 		 * for bit truncation errors.  To avoid it we'd need an	\
@@ -111,45 +104,18 @@
 		 *							\
 		 * Instead we do m = p / b and n / b = (n * m + m) / p.	\
 		 */							\
-		___bias = 1;						\
+		___bias = true;						\
 		/* Compute m = (p << 64) / b */				\
 		___m = (~0ULL / ___b) * ___p;				\
 		___m += ((~0ULL % ___b + 1) * ___p) / ___b;		\
-	} else {							\
-		/*							\
-		 * Reduce m / p, and try to clear bit 31 of m when	\
-		 * possible, otherwise that'll need extra overflow	\
-		 * handling later.					\
-		 */							\
-		uint32_t ___bits = -(___m & -___m);			\
-		___bits |= ___m >> 32;					\
-		___bits = (~___bits) << 1;				\
-		/*							\
-		 * If ___bits == 0 then setting bit 31 is  unavoidable.	\
-		 * Simply apply the maximum possible reduction in that	\
-		 * case. Otherwise the MSB of ___bits indicates the	\
-		 * best reduction we should apply.			\
-		 */							\
-		if (!___bits) {						\
-			___p /= (___m & -___m);				\
-			___m /= (___m & -___m);				\
-		} else {						\
-			___p >>= ilog2(___bits);			\
-			___m >>= ilog2(___bits);			\
-		}							\
-		/* No bias needed. */					\
-		___bias = 0;						\
 	}								\
 									\
+	/* Reduce m / p to help avoid overflow handling later. */	\
+	___p /= (___m & -___m);						\
+	___m /= (___m & -___m);						\
+									\
 	/*								\
-	 * Now we have a combination of 2 conditions:			\
-	 *								\
-	 * 1) whether or not we need to apply a bias, and		\
-	 *								\
-	 * 2) whether or not there might be an overflow in the cross	\
-	 *    product determined by (___m & ((1 << 63) | (1 << 31))).	\
-	 *								\
-	 * Select the best way to do (m_bias + m * n) / (1 << 64).	\
+	 * Perform (m_bias + m * n) / (1 << 64).			\
 	 * From now on there will be actual runtime code generated.	\
 	 */								\
 	___res = __arch_xprod_64(___m, ___n, ___bias);			\
@@ -165,7 +131,7 @@
  * Semantic:  retval = ((bias ? m : 0) + m * n) >> 64
  *
  * The product is a 128-bit value, scaled down to 64 bits.
- * Assuming constant propagation to optimize away unused conditional code.
+ * Hoping for compile-time optimization of conditional code.
  * Architectures may provide their own optimized assembly implementation.
  */
 static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
@@ -174,38 +140,28 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 	uint32_t m_hi = m >> 32;
 	uint32_t n_lo = n;
 	uint32_t n_hi = n >> 32;
-	uint64_t res;
-	uint32_t res_lo, res_hi, tmp;
-
-	if (!bias) {
-		res = ((uint64_t)m_lo * n_lo) >> 32;
-	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
-		/* there can't be any overflow here */
-		res = (m + (uint64_t)m_lo * n_lo) >> 32;
-	} else {
-		res = m + (uint64_t)m_lo * n_lo;
-		res_lo = res >> 32;
-		res_hi = (res_lo < m_hi);
-		res = res_lo | ((uint64_t)res_hi << 32);
-	}
-
-	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
-		/* there can't be any overflow here */
-		res += (uint64_t)m_lo * n_hi;
-		res += (uint64_t)m_hi * n_lo;
-		res >>= 32;
+	uint64_t x, y;
+
+	/* Determine if overflow handling can be dispensed with. */
+	bool no_ovf = __builtin_constant_p(m) &&
+		      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
+
+	if (no_ovf) {
+		x = (uint64_t)m_lo * n_lo + (bias ? m : 0);
+		x >>= 32;
+		x += (uint64_t)m_lo * n_hi;
+		x += (uint64_t)m_hi * n_lo;
+		x >>= 32;
+		x += (uint64_t)m_hi * n_hi;
 	} else {
-		res += (uint64_t)m_lo * n_hi;
-		tmp = res >> 32;
-		res += (uint64_t)m_hi * n_lo;
-		res_lo = res >> 32;
-		res_hi = (res_lo < tmp);
-		res = res_lo | ((uint64_t)res_hi << 32);
+		x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0);
+		y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0);
+		x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
+		y = (uint64_t)m_hi * n_lo + (uint32_t)y;
+		x += (uint32_t)(y >> 32);
 	}
 
-	res += (uint64_t)m_hi * n_hi;
-
-	return res;
+	return x;
 }
 #endif
 
-- 
2.45.2


* [PATCH v2 3/4] ARM: div64: improve __arch_xprod_64()
  2024-07-07 17:17 [PATCH v2 0/4] do_div() with constant divisor simplification Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 1/4] lib/math/test_div64: add some edge cases relevant to __div64_const32() Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 2/4] asm-generic/div64: optimize/simplify __div64_const32() Nicolas Pitre
@ 2024-07-07 17:17 ` Nicolas Pitre
  2024-07-07 17:17 ` [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance Nicolas Pitre
  3 siblings, 0 replies; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 17:17 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Let's use the same criterion as the generic code for deciding when
overflow handling is necessary.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 arch/arm/include/asm/div64.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 4b69cf8504..562d5376ae 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -56,6 +56,8 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 {
 	unsigned long long res;
 	register unsigned int tmp asm("ip") = 0;
+	bool no_ovf = __builtin_constant_p(m) &&
+		      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
 
 	if (!bias) {
 		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
@@ -63,7 +65,7 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 			: "=&r" (res)
 			: "r" (m), "r" (n)
 			: "cc");
-	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+	} else if (no_ovf) {
 		res = m;
 		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
 			"mov	%Q0, #0"
@@ -80,7 +82,7 @@ static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 			: "cc");
 	}
 
-	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+	if (no_ovf) {
 		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
 			"umlal	%R0, %Q0, %Q1, %R2\n\t"
 			"mov	%R0, #0\n\t"
-- 
2.45.2


* [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 17:17 [PATCH v2 0/4] do_div() with constant divisor simplification Nicolas Pitre
                   ` (2 preceding siblings ...)
  2024-07-07 17:17 ` [PATCH v2 3/4] ARM: div64: improve __arch_xprod_64() Nicolas Pitre
@ 2024-07-07 17:17 ` Nicolas Pitre
  2024-07-07 18:59   ` Arnd Bergmann
                     ` (2 more replies)
  3 siblings, 3 replies; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 17:17 UTC (permalink / raw)
  To: Arnd Bergmann, Russell King; +Cc: Nicolas Pitre, linux-arch, linux-kernel

From: Nicolas Pitre <npitre@baylibre.com>

Recent gcc versions have stopped systematically inlining __arch_xprod_64(),
and that has performance implications. Give the compiler the freedom to
decide only when optimizing for size.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 arch/arm/include/asm/div64.h | 7 ++++++-
 include/asm-generic/div64.h  | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 562d5376ae..3912bf4ce9 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -52,7 +52,12 @@ static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
 
 #else
 
-static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
+static __always_inline
+#else
+static inline
+#endif
+__arch_xprod_64(uint64_t m, uint64_t n, bool bias)
 {
 	unsigned long long res;
 	register unsigned int tmp asm("ip") = 0;
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 5d59cf7e73..25e7b4b58d 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -134,7 +134,12 @@
  * Hoping for compile-time optimization of conditional code.
  * Architectures may provide their own optimized assembly implementation.
  */
-static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
+static __always_inline
+#else
+static inline
+#endif
+uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 {
 	uint32_t m_lo = m;
 	uint32_t m_hi = m >> 32;
-- 
2.45.2


* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 17:17 ` [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance Nicolas Pitre
@ 2024-07-07 18:59   ` Arnd Bergmann
  2024-07-07 19:14     ` Nicolas Pitre
  2024-07-07 19:40   ` kernel test robot
  2024-07-07 19:40   ` kernel test robot
  2 siblings, 1 reply; 12+ messages in thread
From: Arnd Bergmann @ 2024-07-07 18:59 UTC (permalink / raw)
  To: Nicolas Pitre, Russell King; +Cc: Nicolas Pitre, Linux-Arch, linux-kernel

On Sun, Jul 7, 2024, at 19:17, Nicolas Pitre wrote:
> From: Nicolas Pitre <npitre@baylibre.com>
>
> Recent gcc versions have stopped systematically inlining __arch_xprod_64(),
> and that has performance implications. Give the compiler the freedom to
> decide only when optimizing for size.
>
> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>

Seems reasonable. Just to make sure: do you know if the non-inline
version of xprod_64 ends up producing a more efficient division
result than the __do_div64() code path on arch/arm?

     Arnd

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 18:59   ` Arnd Bergmann
@ 2024-07-07 19:14     ` Nicolas Pitre
  2024-07-07 19:37       ` Arnd Bergmann
  0 siblings, 1 reply; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-07 19:14 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Russell King, Linux-Arch, linux-kernel

On Sun, 7 Jul 2024, Arnd Bergmann wrote:

> On Sun, Jul 7, 2024, at 19:17, Nicolas Pitre wrote:
> > From: Nicolas Pitre <npitre@baylibre.com>
> >
> > Recent gcc versions have stopped systematically inlining __arch_xprod_64(),
> > and that has performance implications. Give the compiler the freedom to
> > decide only when optimizing for size.
> >
> > Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
> 
> Seems reasonable. Just to make sure: do you know if the non-inline
> version of xprod_64 ends up producing a more efficient division
> result than the __do_div64() code path on arch/arm?

__arch_xprod_64() is part of the __do_div64() code path. So I'm not sure 
of your question.

Obviously, having __arch_xprod_64() inlined is faster but it increases 
binary size.


Nicolas

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 19:14     ` Nicolas Pitre
@ 2024-07-07 19:37       ` Arnd Bergmann
  2024-07-08  1:21         ` Nicolas Pitre
  0 siblings, 1 reply; 12+ messages in thread
From: Arnd Bergmann @ 2024-07-07 19:37 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Russell King, Linux-Arch, linux-kernel

On Sun, Jul 7, 2024, at 21:14, Nicolas Pitre wrote:
> On Sun, 7 Jul 2024, Arnd Bergmann wrote:
>
>> On Sun, Jul 7, 2024, at 19:17, Nicolas Pitre wrote:
>> > From: Nicolas Pitre <npitre@baylibre.com>
>> >
> >> > Recent gcc versions have stopped systematically inlining __arch_xprod_64(),
>> > and that has performance implications. Give the compiler the freedom to
>> > decide only when optimizing for size.
>> >
>> > Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
>> 
>> Seems reasonable. Just to make sure: do you know if the non-inline
>> version of xprod_64 ends up producing a more efficient division
>> result than the __do_div64() code path on arch/arm?
>
> __arch_xprod_64() is part of the __do_div64() code path. So I'm not sure 
> of your question.
>
> Obviously, having __arch_xprod_64() inlined is faster but it increases 
> binary size.

I meant whether calling __div64_const32->__arch_xprod_64() is
still faster for a constant base when the new __arch_xprod_64()
is out of line, compared to the __div64_32->__do_div64()
assembly code path we take for a non-constant base.

       Arnd

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 17:17 ` [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance Nicolas Pitre
  2024-07-07 18:59   ` Arnd Bergmann
@ 2024-07-07 19:40   ` kernel test robot
  2024-07-07 19:40   ` kernel test robot
  2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2024-07-07 19:40 UTC (permalink / raw)
  To: Nicolas Pitre, Arnd Bergmann, Russell King
  Cc: llvm, oe-kbuild-all, Nicolas Pitre, linux-arch, linux-kernel

Hi Nicolas,

kernel test robot noticed the following build errors:

[auto build test ERROR on arnd-asm-generic/master]
[also build test ERROR on arm/for-next arm/fixes soc/for-next linus/master v6.10-rc6 next-20240703]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nicolas-Pitre/lib-math-test_div64-add-some-edge-cases-relevant-to-__div64_const32/20240708-013344
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
patch link:    https://lore.kernel.org/r/20240707171919.1951895-5-nico%40fluxnic.net
patch subject: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
config: arm-randconfig-003-20240708 (https://download.01.org/0day-ci/archive/20240708/202407080326.fBdpm1Tq-lkp@intel.com/config)
compiler: clang version 16.0.6 (https://github.com/llvm/llvm-project 7cbf1a2591520c2491aa35339f227775f4d3adf6)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240708/202407080326.fBdpm1Tq-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407080326.fBdpm1Tq-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/arm/kernel/asm-offsets.c:11:
   In file included from include/linux/sched.h:16:
   In file included from include/linux/cpumask.h:11:
   In file included from include/linux/kernel.h:27:
   In file included from include/linux/math.h:6:
>> arch/arm/include/asm/div64.h:60:1: error: type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int [-Wimplicit-int]
   __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
   ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:98:11: warning: array index 3 is past the end of the array (that has type 'unsigned long[2]') [-Warray-bounds]
                   return (set->sig[3] | set->sig[2] |
                           ^        ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:98:25: warning: array index 2 is past the end of the array (that has type 'unsigned long[2]') [-Warray-bounds]
                   return (set->sig[3] | set->sig[2] |
                                         ^        ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:114:11: warning: array index 3 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
                   return  (set1->sig[3] == set2->sig[3]) &&
                            ^         ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:114:27: warning: array index 3 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
                   return  (set1->sig[3] == set2->sig[3]) &&
                                            ^         ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:115:5: warning: array index 2 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
                           (set1->sig[2] == set2->sig[2]) &&
                            ^         ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:115:21: warning: array index 2 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
                           (set1->sig[2] == set2->sig[2]) &&
                                            ^         ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:157:1: warning: array index 3 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
   _SIG_SET_BINOP(sigorsets, _sig_or)
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/signal.h:138:8: note: expanded from macro '_SIG_SET_BINOP'
                   a3 = a->sig[3]; a2 = a->sig[2];                         \
                        ^      ~
   arch/arm/include/asm/signal.h:17:2: note: array 'sig' declared here
           unsigned long sig[_NSIG_WORDS];
           ^
   In file included from arch/arm/kernel/asm-offsets.c:12:
   In file included from include/linux/mm.h:1120:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:


vim +/int +60 arch/arm/include/asm/div64.h

    54	
    55	#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
    56	static __always_inline
    57	#else
    58	static inline
    59	#endif
  > 60	__arch_xprod_64(uint64_t m, uint64_t n, bool bias)
    61	{
    62		unsigned long long res;
    63		register unsigned int tmp asm("ip") = 0;
    64		bool no_ovf = __builtin_constant_p(m) &&
    65			      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
    66	
    67		if (!bias) {
    68			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
    69				"mov	%Q0, #0"
    70				: "=&r" (res)
    71				: "r" (m), "r" (n)
    72				: "cc");
    73		} else if (no_ovf) {
    74			res = m;
    75			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
    76				"mov	%Q0, #0"
    77				: "+&r" (res)
    78				: "r" (m), "r" (n)
    79				: "cc");
    80		} else {
    81			asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
    82				"cmn	%Q0, %Q2\n\t"
    83				"adcs	%R0, %R0, %R2\n\t"
    84				"adc	%Q0, %1, #0"
    85				: "=&r" (res), "+&r" (tmp)
    86				: "r" (m), "r" (n)
    87				: "cc");
    88		}
    89	
    90		if (no_ovf) {
    91			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
    92				"umlal	%R0, %Q0, %Q1, %R2\n\t"
    93				"mov	%R0, #0\n\t"
    94				"umlal	%Q0, %R0, %R1, %R2"
    95				: "+&r" (res)
    96				: "r" (m), "r" (n)
    97				: "cc");
    98		} else {
    99			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
   100				"umlal	%R0, %1, %Q2, %R3\n\t"
   101				"mov	%R0, #0\n\t"
   102				"adds	%Q0, %1, %Q0\n\t"
   103				"adc	%R0, %R0, #0\n\t"
   104				"umlal	%Q0, %R0, %R2, %R3"
   105				: "+&r" (res), "+&r" (tmp)
   106				: "r" (m), "r" (n)
   107				: "cc");
   108		}
   109	
   110		return res;
   111	}
   112	#define __arch_xprod_64 __arch_xprod_64
   113	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 17:17 ` [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance Nicolas Pitre
  2024-07-07 18:59   ` Arnd Bergmann
  2024-07-07 19:40   ` kernel test robot
@ 2024-07-07 19:40   ` kernel test robot
  2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2024-07-07 19:40 UTC (permalink / raw)
  To: Nicolas Pitre, Arnd Bergmann, Russell King
  Cc: oe-kbuild-all, Nicolas Pitre, linux-arch, linux-kernel

Hi Nicolas,

kernel test robot noticed the following build errors:

[auto build test ERROR on arnd-asm-generic/master]
[also build test ERROR on arm/for-next arm/fixes soc/for-next linus/master v6.10-rc6 next-20240703]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nicolas-Pitre/lib-math-test_div64-add-some-edge-cases-relevant-to-__div64_const32/20240708-013344
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
patch link:    https://lore.kernel.org/r/20240707171919.1951895-5-nico%40fluxnic.net
patch subject: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
config: arm-randconfig-002-20240708 (https://download.01.org/0day-ci/archive/20240708/202407080355.VUEmeBsv-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240708/202407080355.VUEmeBsv-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407080355.VUEmeBsv-lkp@intel.com/

All errors (new ones prefixed by >>):

   scripts/genksyms/parse.y: warning: 9 shift/reduce conflicts [-Wconflicts-sr]
   scripts/genksyms/parse.y: warning: 5 reduce/reduce conflicts [-Wconflicts-rr]
   scripts/genksyms/parse.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
   In file included from include/linux/math.h:6,
                    from include/linux/kernel.h:27,
                    from include/linux/cpumask.h:11,
                    from include/linux/sched.h:16,
                    from arch/arm/kernel/asm-offsets.c:11:
>> arch/arm/include/asm/div64.h:60:1: error: return type defaults to 'int' [-Werror=implicit-int]
      60 | __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
         | ^~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors
   make[3]: *** [scripts/Makefile.build:117: arch/arm/kernel/asm-offsets.s] Error 1 shuffle=2335528022
   make[3]: Target 'prepare' not remade because of errors.
   make[2]: *** [Makefile:1208: prepare0] Error 2 shuffle=2335528022
   make[2]: Target 'prepare' not remade because of errors.
   make[1]: *** [Makefile:240: __sub-make] Error 2 shuffle=2335528022
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:240: __sub-make] Error 2 shuffle=2335528022
   make: Target 'prepare' not remade because of errors.


vim +/int +60 arch/arm/include/asm/div64.h

    54	
    55	#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
    56	static __always_inline
    57	#else
    58	static inline
    59	#endif
  > 60	__arch_xprod_64(uint64_t m, uint64_t n, bool bias)
    61	{
    62		unsigned long long res;
    63		register unsigned int tmp asm("ip") = 0;
    64		bool no_ovf = __builtin_constant_p(m) &&
    65			      ((m >> 32) + (m & 0xffffffff) < 0x100000000);
    66	
    67		if (!bias) {
    68			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
    69				"mov	%Q0, #0"
    70				: "=&r" (res)
    71				: "r" (m), "r" (n)
    72				: "cc");
    73		} else if (no_ovf) {
    74			res = m;
    75			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
    76				"mov	%Q0, #0"
    77				: "+&r" (res)
    78				: "r" (m), "r" (n)
    79				: "cc");
    80		} else {
    81			asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
    82				"cmn	%Q0, %Q2\n\t"
    83				"adcs	%R0, %R0, %R2\n\t"
    84				"adc	%Q0, %1, #0"
    85				: "=&r" (res), "+&r" (tmp)
    86				: "r" (m), "r" (n)
    87				: "cc");
    88		}
    89	
    90		if (no_ovf) {
    91			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
    92				"umlal	%R0, %Q0, %Q1, %R2\n\t"
    93				"mov	%R0, #0\n\t"
    94				"umlal	%Q0, %R0, %R1, %R2"
    95				: "+&r" (res)
    96				: "r" (m), "r" (n)
    97				: "cc");
    98		} else {
    99			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
   100				"umlal	%R0, %1, %Q2, %R3\n\t"
   101				"mov	%R0, #0\n\t"
   102				"adds	%Q0, %1, %Q0\n\t"
   103				"adc	%R0, %R0, #0\n\t"
   104				"umlal	%Q0, %R0, %R2, %R3"
   105				: "+&r" (res), "+&r" (tmp)
   106				: "r" (m), "r" (n)
   107				: "cc");
   108		}
   109	
   110		return res;
   111	}
   112	#define __arch_xprod_64 __arch_xprod_64
   113	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-07 19:37       ` Arnd Bergmann
@ 2024-07-08  1:21         ` Nicolas Pitre
  2024-07-08  5:21           ` Arnd Bergmann
  0 siblings, 1 reply; 12+ messages in thread
From: Nicolas Pitre @ 2024-07-08  1:21 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Russell King, Linux-Arch, linux-kernel

On Sun, 7 Jul 2024, Arnd Bergmann wrote:

> On Sun, Jul 7, 2024, at 21:14, Nicolas Pitre wrote:
> > On Sun, 7 Jul 2024, Arnd Bergmann wrote:
> >
> >> On Sun, Jul 7, 2024, at 19:17, Nicolas Pitre wrote:
> >> > From: Nicolas Pitre <npitre@baylibre.com>
> >> >
> >> > Recent gcc versions have stopped systematically inlining __arch_xprod_64(),
> >> > and that has performance implications. Give the compiler the freedom to
> >> > decide only when optimizing for size.
> >> >
> >> > Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
> >> 
> >> Seems reasonable. Just to make sure: do you know if the non-inline
> >> version of xprod_64 ends up producing a more efficient division
> >> result than the __do_div64() code path on arch/arm?
> >
> > __arch_xprod_64() is part of the __do_div64() code path. So I'm not sure 
> > of your question.
> >
> > Obviously, having __arch_xprod_64() inlined is faster but it increases 
> > binary size.
> 
> I meant whether calling __div64_const32->__arch_xprod_64() is
> still faster for a constant base when the new __arch_xprod_64()
> is out of line, compared to the __div64_32->__do_div64()
> assembly code path we take for a non-constant base.

Oh, most likely yes. The non-constant base has to go through the whole 
one-bit-at-a-time division loop whereas the constant base with 
__div64_const32 results in 4 64-bit multiplies and adds. Moving 
__arch_xprod_64() out of line adds the argument shuffling overhead and 
it can no longer skip the overflow handling, but it still wins.
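
To illustrate the cost difference, the non-constant path boils down to
a shift-and-subtract loop of this shape (a simplified sketch of the
generic C fallback; the actual ARM __do_div64 is hand-written assembly):

#include <stdint.h>

static uint32_t div64_32_sketch(uint64_t *n, uint32_t base)
{
	uint64_t rem = *n, b = base, res = 0, d = 1;

	/* scale the divisor up as far as possible */
	while (b < rem && !(b & (1ULL << 63))) {
		b <<= 1;
		d <<= 1;
	}
	/* then produce one quotient bit per iteration */
	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}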

Here's some numbers. With latest patches using __always_inline:

test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.048285584s elapsed

Latest patches but __always_inline left out:

test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.053023584s elapsed

Forcing both constant and non-constant base through the same path:

test_div64: Starting 64bit/32bit division and modulo test
test_div64: Completed 64bit/32bit division and modulo test, 0.103263776s elapsed

It is worth noting that test_div64 already does half of its tests with 
non-constant divisors, so the impact is greater than what those numbers show.

And for what it is worth, those numbers were obtained using QEMU. The 
gcc version is 14.1.0.


Nicolas

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] __arch_xprod64(): make __always_inline when optimizing for performance
  2024-07-08  1:21         ` Nicolas Pitre
@ 2024-07-08  5:21           ` Arnd Bergmann
  0 siblings, 0 replies; 12+ messages in thread
From: Arnd Bergmann @ 2024-07-08  5:21 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Russell King, Linux-Arch, linux-kernel

On Mon, Jul 8, 2024, at 03:21, Nicolas Pitre wrote:
> On Sun, 7 Jul 2024, Arnd Bergmann wrote:
>> On Sun, Jul 7, 2024, at 21:14, Nicolas Pitre wrote:
>
> Oh, most likely yes. The non-constant base has to go through the whole 
> one-bit-at-a-time division loop whereas the constant base with 
> __div64_const32 results in 4 64-bits multiply and add. Moving 
> __arch_xprod_64() out of line adds the argument shuffling overhead and 
> it can't skip overflow handling, but still.
>
> Here's some numbers. With latest patches using __always_inline:
>
> test_div64: Starting 64bit/32bit division and modulo test
> test_div64: Completed 64bit/32bit division and modulo test, 0.048285584s elapsed
>
> Latest patches but __always_inline left out:
>
> test_div64: Starting 64bit/32bit division and modulo test
> test_div64: Completed 64bit/32bit division and modulo test, 0.053023584s elapsed
>
> Forcing both constant and non-constant base through the same path:
>
> test_div64: Starting 64bit/32bit division and modulo test
> test_div64: Completed 64bit/32bit division and modulo test, 0.103263776s elapsed
>
> It is worth noting that test_div64 does half the test with non constant 
> divisors already so the impact is greater than what those numbers show.
>
> And for what it is worth, those numbers were obtained using QEMU. The 
> gcc version is 14.1.0.

Right, so with the numbers in qemu matching your explanation,
it seems reasonable to assume it will behave the same way
across a wide range of physical CPUs.

    Arnd
