public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] md/raid6 algorithms: scale test duration for speedier boots
@ 2024-08-05 17:08 Colin Ian King
  2024-08-06  3:20 ` kernel test robot
  2024-08-06  4:01 ` kernel test robot
  0 siblings, 2 replies; 8+ messages in thread
From: Colin Ian King @ 2024-08-05 17:08 UTC (permalink / raw)
  To: Andrew Morton, Masahiro Yamada, rppt; +Cc: linux-kernel

Instead of using jiffies and waiting for jiffies to wrap before
measuring use the higher precision local_time for benchmarking.
Measure 2500 loops, which works out to be accurate enough for
benchmarking the raid algo data rates. Also add division by zero
checking in case timing measurements are bogus.

Speeds up raid benchmarkingm, reduces calibration time from
48,000 usecs to 4000 usecs on a i9-19200.

Signed-off-by: Colin Ian King <colin.king@intel.com>
---
 lib/raid6/algos.c | 56 ++++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index cd2e88ee1..5d974ae8d 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,6 +18,8 @@
 #else
 #include <linux/module.h>
 #include <linux/gfp.h>
+#include <linux/sched/clock.h>
+
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 EXPORT_SYMBOL(raid6_empty_zero_page);
@@ -155,12 +157,15 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 static inline const struct raid6_calls *raid6_choose_gen(
 	void *(*const dptrs)[RAID6_TEST_DISKS], const int disks)
 {
-	unsigned long perf, bestgenperf, j0, j1;
+	unsigned long perf;
+	const unsigned long max_perf = 2500;
 	int start = (disks>>1)-1, stop = disks-3;	/* work on the second half of the disks */
 	const struct raid6_calls *const *algo;
 	const struct raid6_calls *best;
+	const u64 ns_per_mb = 1000000000 >> 20;
+	u64 n, ns, t, ns_best = ~0ULL;
 
-	for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+	for (best = NULL, algo = raid6_algos; *algo; algo++) {
 		if (!best || (*algo)->priority >= best->priority) {
 			if ((*algo)->valid && !(*algo)->valid())
 				continue;
@@ -170,26 +175,20 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				break;
 			}
 
-			perf = 0;
-
 			preempt_disable();
-			j0 = jiffies;
-			while ((j1 = jiffies) == j0)
-				cpu_relax();
-			while (time_before(jiffies,
-					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+			t = local_clock();
+			for (perf = 0; perf < max_perf; perf++)
 				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
-				perf++;
-			}
+			ns = local_clock() - t;
 			preempt_enable();
 
-			if (perf > bestgenperf) {
-				bestgenperf = perf;
+			if (ns < ns_best) {
+				ns_best = ns;
 				best = *algo;
 			}
-			pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
-				(perf * HZ * (disks-2)) >>
-				(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+			n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+			pr_info("raid6: %-8s gen() %5llu MB/s (%llu ns)\n", (*algo)->name,
+				(ns > 0) ? n / ns : 0, ns);
 		}
 	}
 
@@ -206,31 +205,24 @@ static inline const struct raid6_calls *raid6_choose_gen(
 		goto out;
 	}
 
-	pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
-		best->name,
-		(bestgenperf * HZ * (disks - 2)) >>
-		(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+	n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+	pr_info("raid6: using algorithm %s gen() %llu MB/s (%llu ns)\n",
+		best->name, (ns_best > 0) ? n / ns_best : 0, ns_best);
 
 	if (best->xor_syndrome) {
-		perf = 0;
-
 		preempt_disable();
-		j0 = jiffies;
-		while ((j1 = jiffies) == j0)
-			cpu_relax();
-		while (time_before(jiffies,
-				   j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
+		t = local_clock();
+		for (perf = 0; perf < max_perf; perf++) {
 			best->xor_syndrome(disks, start, stop,
 					   PAGE_SIZE, *dptrs);
-			perf++;
 		}
+		ns = local_clock() - t;
 		preempt_enable();
 
-		pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
-			(perf * HZ * (disks - 2)) >>
-			(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+		n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+		pr_info("raid6: .... xor() %llu MB/s, rmw enabled (%llu ns)\n",
+			(ns > 0) ? n / ns : 0, ns);
 	}
-
 out:
 	return best;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread
* [PATCH] md/raid6 algorithms: scale test duration for speedier boots
@ 2025-04-07 14:31 Colin Ian King
  2025-04-07 18:35 ` kernel test robot
  2025-04-08  2:58 ` kernel test robot
  0 siblings, 2 replies; 8+ messages in thread
From: Colin Ian King @ 2025-04-07 14:31 UTC (permalink / raw)
  To: Andrew Morton, Song Liu; +Cc: linux-kernel

Instead of using jiffies (and waiting for jiffies to wrap before
benchmarking the algorithms) instead use the higher precision local_time
for benchmarking. This patch performs 2,500 iterations of the benchmark
measurements which works out to be accurate enough for benchmarking the
raid algorithm data rates. Also add division by zero checking in case
timing measurements are bogus.

Measuring 100 re-boots on Intel(R) Core(TM) Ultra 9 285K with
improves raid64 benchmarking loop from ~68000 usecs to ~5300 usec.

This patch has been in use in Clear Linux for ~2 years w/o issues.

Signed-off-by: Colin Ian King <colin.king@intel.com>
---
 lib/raid6/algos.c | 53 ++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index cd2e88ee1f14..b846635542bc 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,6 +18,8 @@
 #else
 #include <linux/module.h>
 #include <linux/gfp.h>
+#include <linux/sched/clock.h>
+
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 EXPORT_SYMBOL(raid6_empty_zero_page);
@@ -155,12 +157,15 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 static inline const struct raid6_calls *raid6_choose_gen(
 	void *(*const dptrs)[RAID6_TEST_DISKS], const int disks)
 {
-	unsigned long perf, bestgenperf, j0, j1;
+	unsigned long perf;
+	const unsigned long max_perf = 2500;
 	int start = (disks>>1)-1, stop = disks-3;	/* work on the second half of the disks */
 	const struct raid6_calls *const *algo;
 	const struct raid6_calls *best;
+	const u64 ns_per_mb = 1000000000 >> 20;
+	u64 n, ns, t, ns_best = ~0ULL;
 
-	for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+	for (best = NULL, algo = raid6_algos; *algo; algo++) {
 		if (!best || (*algo)->priority >= best->priority) {
 			if ((*algo)->valid && !(*algo)->valid())
 				continue;
@@ -170,26 +175,20 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				break;
 			}
 
-			perf = 0;
-
 			preempt_disable();
-			j0 = jiffies;
-			while ((j1 = jiffies) == j0)
-				cpu_relax();
-			while (time_before(jiffies,
-					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+			t = local_clock();
+			for (perf = 0; perf < max_perf; perf++) {
 				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
-				perf++;
 			}
+			ns = local_clock() - t;
 			preempt_enable();
 
-			if (perf > bestgenperf) {
-				bestgenperf = perf;
+			if (ns < ns_best) {
+				ns_best = ns;
 				best = *algo;
 			}
-			pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
-				(perf * HZ * (disks-2)) >>
-				(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+			n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+			pr_info("raid6: %-8s gen() %5llu MB/s (%llu ns)\n", (*algo)->name, (ns > 0) ? n / ns : 0, ns);
 		}
 	}
 
@@ -206,31 +205,23 @@ static inline const struct raid6_calls *raid6_choose_gen(
 		goto out;
 	}
 
-	pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
-		best->name,
-		(bestgenperf * HZ * (disks - 2)) >>
-		(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+	n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+	pr_info("raid6: using algorithm %s gen() %llu MB/s (%llu ns)\n",
+		best->name, (ns_best > 0) ? n / ns_best : 0, ns_best);
 
 	if (best->xor_syndrome) {
-		perf = 0;
-
 		preempt_disable();
-		j0 = jiffies;
-		while ((j1 = jiffies) == j0)
-			cpu_relax();
-		while (time_before(jiffies,
-				   j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
+		t = local_clock();
+		for (perf = 0; perf < max_perf; perf++) {
 			best->xor_syndrome(disks, start, stop,
 					   PAGE_SIZE, *dptrs);
-			perf++;
 		}
+		ns = local_clock() - t;
 		preempt_enable();
 
-		pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
-			(perf * HZ * (disks - 2)) >>
-			(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+		n = max_perf * PAGE_SIZE * ns_per_mb * (disks - 2);
+		pr_info("raid6: .... xor() %llu MB/s, rmw enabled (%llu ns)\n", (ns > 0) ? n / ns : 0, ns);
 	}
-
 out:
 	return best;
 }
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread
* [PATCH] md/raid6 algorithms: scale test duration for speedier boots
@ 2021-09-09 13:59 Colin King
  0 siblings, 0 replies; 8+ messages in thread
From: Colin King @ 2021-09-09 13:59 UTC (permalink / raw)
  To: Neil Brown; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

The original code runs for a set run time based on the duration of
2^RAID6_TIME_JIFFIES_LG2. The default kernel value for
RAID6_TIME_JIFFIES_LG2 is 4, however, emperical testing shows that a
value of 3.5 is the sweet spot for getting consistent benchmarking
results and speeding up the run time of the benchmarking.

To achieve 2^3.5 we use the following:
   2^3.5 = 2^4 / 2^0.5
         = 2^4 / sqrt(2)
         = 2^4 * 0.707106781

Too keep this as integer math that is as accurate as required and avoiding
overflow, this becomes:
         = 2^4 * 181 / 256
         = (2^4 * 181) >> 8

We also need to scale down perf by the same factor, however, to
get a good approximate integer result without an overflow we scale
by 2^4.0 * sqrt(2) =
         = 2 ^ 4 * 1.41421356237
         = 2 ^ 4 * 1448 / 1024
         = (2 ^ 4 * 1448) >> 10

This has been tested on 2 AWS instances, a small t2 and a medium m3
with 30 boot tests each and compared to the same instances booted 30
times on an umodified kernel. In all results, we get the same
algorithms being selected and a 100% consistent result over the 30
boots, showing that this optimised jiffy timing scaling does not break
the original functionality.

On the t2.small we see a saving of ~0.126 seconds and t3.medium a saving of
~0.18 seconds.

Tested on a 4 CPU VM on an 8 thread Xeon server; seeing a saving of ~0.35
seconds (average over 50 boots).

The testing included double checking the algorithm chosen by the optimized
selection and seeing the same as pre-optimised version.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 lib/raid6/algos.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 6d5e5000fdd7..5d5b04632168 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -152,6 +152,10 @@ static inline const struct raid6_calls *raid6_choose_gen(
 
 	for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
 		if (!best || (*algo)->prefer >= best->prefer) {
+			/* 2 ^ (RAID6_TIME_JIFFIES_LG2 - 0.5) */
+			const unsigned long raid6_time_jiffies =
+				((1 << RAID6_TIME_JIFFIES_LG2) * 181) >> 8;
+
 			if ((*algo)->valid && !(*algo)->valid())
 				continue;
 
@@ -167,7 +171,7 @@ static inline const struct raid6_calls *raid6_choose_gen(
 			while ((j1 = jiffies) == j0)
 				cpu_relax();
 			while (time_before(jiffies,
-					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+					    j1 + raid6_time_jiffies)) {
 				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
 				perf++;
 			}
@@ -178,8 +182,8 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				best = *algo;
 			}
 			pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
-				(perf * HZ * (disks-2)) >>
-				(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+				(((perf * HZ * (disks-2)) >>
+				 (20 - 16 + RAID6_TIME_JIFFIES_LG2)) * 1448) >> 10);
 
 			if (!(*algo)->xor_syndrome)
 				continue;
@@ -191,7 +195,7 @@ static inline const struct raid6_calls *raid6_choose_gen(
 			while ((j1 = jiffies) == j0)
 				cpu_relax();
 			while (time_before(jiffies,
-					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+					    j1 + raid6_time_jiffies)) {
 				(*algo)->xor_syndrome(disks, start, stop,
 						      PAGE_SIZE, *dptrs);
 				perf++;
@@ -202,8 +206,8 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				bestxorperf = perf;
 
 			pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name,
-				(perf * HZ * (disks-2)) >>
-				(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+				(((perf * HZ * (disks-2)) >>
+				 (20 - 16 + RAID6_TIME_JIFFIES_LG2 + 1)) * 1448) >> 10);
 		}
 	}
 
@@ -215,8 +219,8 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				(20 - PAGE_SHIFT+RAID6_TIME_JIFFIES_LG2));
 			if (best->xor_syndrome)
 				pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
-					(bestxorperf * HZ * (disks-2)) >>
-					(20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+					(((bestxorperf * HZ * (disks-2)) >>
+					 (20 - 16 + RAID6_TIME_JIFFIES_LG2 + 1)) * 1448) >> 10);
 		} else
 			pr_info("raid6: skip pq benchmark and using algorithm %s\n",
 				best->name);
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-04-08  2:59 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-05 17:08 [PATCH] md/raid6 algorithms: scale test duration for speedier boots Colin Ian King
2024-08-06  3:20 ` kernel test robot
2024-08-06  4:01 ` kernel test robot
2024-08-06 21:28   ` King, Colin
  -- strict thread matches above, loose matches on Subject: below --
2025-04-07 14:31 Colin Ian King
2025-04-07 18:35 ` kernel test robot
2025-04-08  2:58 ` kernel test robot
2021-09-09 13:59 Colin King

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox