[PATCH] perf: optimize clear page in Intel specified model with movq instruction

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] perf: optimize clear page in Intel specified model with movq instruction
@ 2021-09-09  8:45 Jinhua Wu
  2021-09-09  9:39 ` Borislav Petkov
  0 siblings, 1 reply; 10+ messages in thread
From: Jinhua Wu @ 2021-09-09  8:45 UTC (permalink / raw)
  To: x86
  Cc: zelin.deng, jiayu.ni, wujinhua, ak, luming.yu, fan.du, artie.ding,
	tony.luck, tglx, bp, linux-kernel, pawan.kumar.gupta, fenghua.yu,
	hpa, ricardo.neri-calderon, peterz

Clearing a page is the most time-consuming step in page fault handling.
The kernel uses fast-string instructions to clear a page. We found that on
specific Intel models such as CPX and ICX, the movq instruction performs much
better than the fast-string instructions when the corresponding page is not in
cache, whereas fast-string performs better when the page is in cache. The test
results are shown below:

machine: Intel CPX

Allocated memory size      Page fault latency per 4K byte
                          rep stosb             movq
--------------------    ----------------  ------------------
        8MB               2057.13ns           1338.38ns
        64MB              1850.71ns           1200.20ns
        512MB             1918.40ns           1196.91ns
        4096MB            1931.24ns           1189.41ns

We see that there is a 40% performance improvement. So we add a blacklist of
specific Intel models on which the movq instruction is used to clear pages.

Signed-off-by: Jinhua Wu <wujinhua@linux.alibaba.com>
Signed-off-by: Jiayu Ni <jiayu.ni@linux.alibaba.com>
Signed-off-by: Artie Ding <artie.ding@linux.alibaba.com>
---
 arch/x86/include/asm/page_64.h | 18 ++++++++++++------
 arch/x86/kernel/cpu/intel.c    | 22 ++++++++++++++++++++++
 arch/x86/mm/init.c             |  9 +++++++++
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4bde0dc..1fedfbe 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -7,6 +7,8 @@
 #ifndef __ASSEMBLY__
 #include <asm/alternative.h>
 
+#include <linux/jump_label.h>
+
 /* duplicated to the one in bootmem.h */
 extern unsigned long max_pfn;
 extern unsigned long phys_base;
@@ -43,15 +45,19 @@ static inline unsigned long __phys_addr_nodebug(unsigned long x)
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
+extern struct static_key_false clear_page_movq_key;
 
 static inline void clear_page(void *page)
 {
-	alternative_call_2(clear_page_orig,
-			   clear_page_rep, X86_FEATURE_REP_GOOD,
-			   clear_page_erms, X86_FEATURE_ERMS,
-			   "=D" (page),
-			   "0" (page)
-			   : "cc", "memory", "rax", "rcx");
+	if (static_branch_unlikely(&clear_page_movq_key))
+		clear_page_orig(page);
+	else
+		alternative_call_2(clear_page_orig,
+				clear_page_rep, X86_FEATURE_REP_GOOD,
+				clear_page_erms, X86_FEATURE_ERMS,
+				"=D" (page),
+				"0" (page)
+				: "cc", "memory", "rax", "rcx");
 }
 
 void copy_page(void *to, void *from);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8321c43..3366da0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -38,6 +38,28 @@
 #include <asm/apic.h>
 #endif
 
+/* Optimize clear page with movq in specific Intel CPU */
+#include <asm/cpu_device_id.h>
+#include <linux/jump_label.h>
+
+DEFINE_STATIC_KEY_FALSE(clear_page_movq_key);
+EXPORT_SYMBOL_GPL(clear_page_movq_key);
+
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
+
+const struct x86_cpu_id faststring_blacklist_match[] __initconst = {
+	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_XEON_D, 0)
+};
+
+void enable_clear_page_movq(void)
+{
+    if (x86_match_cpu(faststring_blacklist_match))
+		static_branch_enable(&clear_page_movq_key);
+}
+
+
 enum split_lock_detect_state {
 	sld_off = 0,
 	sld_warn,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 23a14d8..480c189 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,6 +28,12 @@
 #include <asm/memtype.h>
 
 /*
+ * Optimize clear page with movq in specific Intel CPU
+ * Definition in intel.c
+ */
+extern void enable_clear_page_movq(void);
+
+/*
  * We need to define the tracepoints somewhere, and tlb.c
  * is only compiled when SMP=y.
  */
@@ -775,6 +781,9 @@ void __init init_mem_mapping(void)
 
 	x86_init.hyper.init_mem_mapping();
 
+	/* Optimize clear page with mov in specific Intel CPU */
+	enable_clear_page_movq();
+
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-09-09  8:45 [PATCH] perf: optimize clear page in Intel specified model with movq instruction Jinhua Wu
@ 2021-09-09  9:39 ` Borislav Petkov
  2021-09-09 10:34   ` Luming Yu
       [not found]   ` <bf6fe59d-c760-40d4-8201-4170cd90ffc3.wujinhua@linux.alibaba.com>
  0 siblings, 2 replies; 10+ messages in thread
From: Borislav Petkov @ 2021-09-09  9:39 UTC (permalink / raw)
  To: Jinhua Wu
  Cc: x86, zelin.deng, jiayu.ni, ak, luming.yu, fan.du, artie.ding,
	tony.luck, tglx, linux-kernel, pawan.kumar.gupta, fenghua.yu, hpa,
	ricardo.neri-calderon, peterz

On Thu, Sep 09, 2021 at 04:45:51PM +0800, Jinhua Wu wrote:
> Clear page is the most time-consuming procedure in page fault handling.
> Kernel use fast-string instruction to clear page. We found that in specified
> Intel model such as CPX and ICX, the movq instruction perform much better
> than fast-string instruction when corresponding page is not in cache.
> But when the page is in cache, fast string perform better. We show the test
> result in the following:

What you should do is show the extensive tests you've run with
real-world benchmarks where you really can show 40% performance
improvement.

Also, the static branch "approach" you're using ain't gonna happen. If
anything, another X86_FEATURE_* bit.

Good luck.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-09-09  9:39 ` Borislav Petkov
@ 2021-09-09 10:34   ` Luming Yu
  2021-09-09 10:44     ` Borislav Petkov
  2021-09-09 11:18     ` Peter Zijlstra
       [not found]   ` <bf6fe59d-c760-40d4-8201-4170cd90ffc3.wujinhua@linux.alibaba.com>
  1 sibling, 2 replies; 10+ messages in thread
From: Luming Yu @ 2021-09-09 10:34 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Jinhua Wu, the arch/x86 maintainers, zelin.deng, jiayu.ni,
	Andi Kleen, Luming Yu, fan.du, artie.ding, Luck, Tony,
	Thomas Gleixner, LKML, pawan.kumar.gupta, Yu, Fenghua,
	H. Peter Anvin, ricardo.neri-calderon, Peter Zijlstra

On Thu, Sep 9, 2021 at 5:41 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Sep 09, 2021 at 04:45:51PM +0800, Jinhua Wu wrote:
> > Clear page is the most time-consuming procedure in page fault handling.
> > Kernel use fast-string instruction to clear page. We found that in specified
> > Intel model such as CPX and ICX, the movq instruction perform much better
> > than fast-string instruction when corresponding page is not in cache.
> > But when the page is in cache, fast string perform better. We show the test
> > result in the following:
>
> What you should do is show the extensive tests you've run with
> real-world benchmarks where you really can show 40% performance
> improvement.
>
> Also, the static branch "approach" you're using ain't gonna happen. If
> anything, another X86_FEATURE_* bit.

do you mean jump label would not be replaced to nop when its key is enabled?
so we could not use it in certain functions?
I don't understand exactly what "ain't  gonna happen"
>
> Good luck.
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-09-09 10:34   ` Luming Yu
@ 2021-09-09 10:44     ` Borislav Petkov
  2021-09-09 11:18     ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Borislav Petkov @ 2021-09-09 10:44 UTC (permalink / raw)
  To: Luming Yu
  Cc: Jinhua Wu, the arch/x86 maintainers, zelin.deng, jiayu.ni,
	Andi Kleen, Luming Yu, fan.du, artie.ding, Luck, Tony,
	Thomas Gleixner, LKML, pawan.kumar.gupta, Yu, Fenghua,
	H. Peter Anvin, ricardo.neri-calderon, Peter Zijlstra

On Thu, Sep 09, 2021 at 06:34:40PM +0800, Luming Yu wrote:
> do you mean jump label would not be replaced to nop when its key is enabled?
> so we could not use it in certain functions?
> I don't understand exactly what "ain't  gonna happen"

It means, you need to use an X86_FEATURE_ bit because I won't accept a
static key.

But do the benchmarks first.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-09-09 10:34   ` Luming Yu
  2021-09-09 10:44     ` Borislav Petkov
@ 2021-09-09 11:18     ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2021-09-09 11:18 UTC (permalink / raw)
  To: Luming Yu
  Cc: Borislav Petkov, Jinhua Wu, the arch/x86 maintainers, zelin.deng,
	jiayu.ni, Andi Kleen, Luming Yu, fan.du, artie.ding, Luck, Tony,
	Thomas Gleixner, LKML, pawan.kumar.gupta, Yu, Fenghua,
	H. Peter Anvin, ricardo.neri-calderon

On Thu, Sep 09, 2021 at 06:34:40PM +0800, Luming Yu wrote:

> do you mean jump label would not be replaced to nop when its key is enabled?
> so we could not use it in certain functions?

But why add a jump label when you can make that alternative DTRT ?


^ permalink raw reply	[flat|nested] 10+ messages in thread

[parent not found: <bf6fe59d-c760-40d4-8201-4170cd90ffc3.wujinhua@linux.alibaba.com>]

* Re: 回复：[PATCH] perf: optimize clear page in Intel specified model with movq instruction
       [not found]   ` <bf6fe59d-c760-40d4-8201-4170cd90ffc3.wujinhua@linux.alibaba.com>
@ 2021-09-09 11:07     ` Borislav Petkov
  2021-09-09 14:51       ` Luming Yu
  0 siblings, 1 reply; 10+ messages in thread
From: Borislav Petkov @ 2021-09-09 11:07 UTC (permalink / raw)
  To: wujinhua
  Cc: x86, zelin.deng, jiayu.ni, ak, luming.yu, fan.du, artie.ding,
	tony.luck, tglx, linux-kernel, pawan.kumar.gupta, fenghua.yu, hpa,
	ricardo.neri-calderon, peterz

On Thu, Sep 09, 2021 at 07:02:08PM +0800, wujinhua wrote:
> I provide three tests and result for you. All the tests run in Intel
> CPX.

I said

"What you should do is show the extensive tests you've run with
real-world benchmarks..."

Are your tests real-world benchmarks?

IOW, no microbenchmarks please.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: 回复：[PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-09-09 11:07     ` 回复：[PATCH] " Borislav Petkov
@ 2021-09-09 14:51       ` Luming Yu
       [not found]         ` <1cac1499-6b00-3c18-b64c-a22f269a2706@linux.alibaba.com>
  0 siblings, 1 reply; 10+ messages in thread
From: Luming Yu @ 2021-09-09 14:51 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: wujinhua, x86, zelin.deng, jiayu.ni, ak, luming.yu, fan.du,
	artie.ding, tony.luck, tglx, linux-kernel, pawan.kumar.gupta,
	fenghua.yu, hpa, ricardo.neri-calderon, peterz

On Thu, Sep 9, 2021 at 7:08 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Sep 09, 2021 at 07:02:08PM +0800, wujinhua wrote:
> > I provide three tests and result for you. All the tests run in Intel
> > CPX.
>
> I said
>
> "What you should do is show the extensive tests you've run with
> real-world benchmarks..."
>
> Are your tests real-world benchmarks?
>
> IOW, no microbenchmarks please.

In theory, I thought it should help system boot time and app/dock launch
time, as well as some customer-specific macro benchmarks.

>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

[parent not found: <1cac1499-6b00-3c18-b64c-a22f269a2706@linux.alibaba.com>]

* Re: 回复：[PATCH] perf: optimize clear page in Intel specified model with movq instruction
       [not found]         ` <1cac1499-6b00-3c18-b64c-a22f269a2706@linux.alibaba.com>
@ 2021-10-16 13:22           ` Borislav Petkov
       [not found]             ` <7b07f141-12f5-397d-9e45-1d507cacae84@linux.alibaba.com>
  0 siblings, 1 reply; 10+ messages in thread
From: Borislav Petkov @ 2021-10-16 13:22 UTC (permalink / raw)
  To: JY Ni
  Cc: Luming Yu, wujinhua, x86, zelin.deng, ak, luming.yu, fan.du,
	artie.ding, tony.luck, tglx, linux-kernel, pawan.kumar.gupta,
	fenghua.yu, hpa, ricardo.neri-calderon, peterz

On Sat, Oct 16, 2021 at 08:58:32PM +0800, JY Ni wrote:
> I rebased this patch on linux-next repo and measured the time of building a
> same kernel in original/add-movq-patch version on the same intel CPX server.

...

> delta = (*build_original_time* - *build_movq_time*) / (*build_movq_time*)
> 
> This set of data shows that movq-patch version has better performance than
> original version in most cases.

First of all, please do not top-post but put your answer under the
quoted text like everyone else.

Then, please explain how exactly you ran that measurement so that I can
try to reproduce it here too.

And just to make sure we're talking about the same thing: the patch in
question is this one:

https://lore.kernel.org/r/1631177151-53723-1-git-send-email-wujinhua@linux.alibaba.com

correct?

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

[parent not found: <7b07f141-12f5-397d-9e45-1d507cacae84@linux.alibaba.com>]

* Re: 回复：[PATCH] perf: optimize clear page in Intel specified model with movq instruction
       [not found]             ` <7b07f141-12f5-397d-9e45-1d507cacae84@linux.alibaba.com>
@ 2021-10-18 12:43               ` Borislav Petkov
  2021-10-18 14:44                 ` Luming Yu
  0 siblings, 1 reply; 10+ messages in thread
From: Borislav Petkov @ 2021-10-18 12:43 UTC (permalink / raw)
  To: JY Ni
  Cc: Luming Yu, wujinhua, x86, zelin.deng, ak, luming.yu, fan.du,
	artie.ding, tony.luck, tglx, linux-kernel, pawan.kumar.gupta,
	fenghua.yu, hpa, ricardo.neri-calderon, peterz

On Mon, Oct 18, 2021 at 03:43:46PM +0800, JY Ni wrote:
> _*Precondition:*__*do tests on a Intel CPX server.*_ CPU information of my
> test machine is in backup part._*

My machine:

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 106
stepping        : 4

That's an ICELAKE_X (model 106 = 0x6A).

I ran

./tools/perf/perf stat --repeat 5 --sync --pre=/root/bin/pre-build-kernel.sh -- make -s -j96 bzImage

on -rc6, building allmodconfig each of the 10 times.

pre-build-kernel.sh is

---
#!/bin/bash

make -s clean
echo 3 > /proc/sys/vm/drop_caches
---

Results are below but to me that's all "in the noise" with around one
percent if I can trust the stddev. Which is not even close to 40%.

So basically you're wasting your time.

5.15-rc6
--------

# ./tools/perf/perf stat --repeat 5 --sync --pre=/root/bin/pre-build-kernel.sh -- make -s -j96 bzImage

 Performance counter stats for 'make -s -j96 bzImage' (5 runs):

      3,072,392.92 msec task-clock                #   51.109 CPUs utilized            ( +-  0.05% )
         1,351,534      context-switches          #  440.257 /sec                     ( +-  0.99% )
           224,862      cpu-migrations            #   73.248 /sec                     ( +-  1.39% )
        85,073,723      page-faults               #   27.712 K/sec                    ( +-  0.01% )
 8,743,357,421,495      cycles                    #    2.848 GHz                      ( +-  0.06% )
 7,643,946,991,468      instructions              #    0.88  insn per cycle           ( +-  0.00% )
 1,705,128,638,240      branches                  #  555.440 M/sec                    ( +-  0.00% )
    37,637,576,027      branch-misses             #    2.21% of all branches          ( +-  0.03% )
22,511,903,971,150      slots                     #    7.333 G/sec                    ( +-  0.03% )
 7,377,211,958,188      topdown-retiring          #     32.5% retiring                ( +-  0.02% )
 3,145,247,374,138      topdown-bad-spec          #     13.9% bad speculation         ( +-  0.27% )
 8,018,664,899,041      topdown-fe-bound          #     35.2% frontend bound          ( +-  0.07% )
 4,167,103,609,622      topdown-be-bound          #     18.3% backend bound           ( +-  0.09% )

            60.114 +- 0.112 seconds time elapsed  ( +-  0.19% )



5.15-rc6 + patch
----------------

 Performance counter stats for 'make -s -j96 bzImage' (5 runs):

      3,033,250.65 msec task-clock                #   51.243 CPUs utilized            ( +-  0.05% )
         1,329,033      context-switches          #  438.210 /sec                     ( +-  0.64% )
           225,550      cpu-migrations            #   74.369 /sec                     ( +-  1.36% )
        85,080,938      page-faults               #   28.053 K/sec                    ( +-  0.00% )
 8,629,663,367,477      cycles                    #    2.845 GHz                      ( +-  0.05% )
 7,696,237,813,803      instructions              #    0.89  insn per cycle           ( +-  0.00% )
 1,709,909,494,107      branches                  #  563.793 M/sec                    ( +-  0.00% )
    37,719,552,337      branch-misses             #    2.21% of all branches          ( +-  0.02% )
22,214,249,023,820      slots                     #    7.325 G/sec                    ( +-  0.06% )
 7,412,342,725,008      topdown-retiring          #     33.0% retiring                ( +-  0.01% )
 3,141,090,408,028      topdown-bad-spec          #     14.1% bad speculation         ( +-  0.17% )
 7,996,077,873,517      topdown-fe-bound          #     35.6% frontend bound          ( +-  0.03% )
 3,862,154,886,962      topdown-be-bound          #     17.3% backend bound           ( +-  0.28% )

            59.193 +- 0.302 seconds time elapsed  ( +-  0.51% )

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: 回复：[PATCH] perf: optimize clear page in Intel specified model with movq instruction
  2021-10-18 12:43               ` Borislav Petkov
@ 2021-10-18 14:44                 ` Luming Yu
  0 siblings, 0 replies; 10+ messages in thread
From: Luming Yu @ 2021-10-18 14:44 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: JY Ni, wujinhua, x86, zelin.deng, ak, luming.yu, fan.du,
	artie.ding, tony.luck, tglx, linux-kernel, pawan.kumar.gupta,
	fenghua.yu, hpa, ricardo.neri-calderon, peterz

On Mon, Oct 18, 2021 at 8:43 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Mon, Oct 18, 2021 at 03:43:46PM +0800, JY Ni wrote:
> > _*Precondition:*__*do tests on a Intel CPX server.*_ CPU information of my
> > test machine is in backup part._*
>
> My machine:
>
> processor       : 0
> vendor_id       : GenuineIntel
> cpu family      : 6
> model           : 106
> stepping        : 4
>
> That's an ICELAKE_X (model 106 = 0x6A).
>
> I ran
>
> ./tools/perf/perf stat --repeat 5 --sync --pre=/root/bin/pre-build-kernel.sh -- make -s -j96 bzImage
>
> on -rc6, building allmodconfig each of the 10 times.
>
> pre-build-kernel.sh is
>
> ---
> #!/bin/bash
>
> make -s clean
> echo 3 > /proc/sys/vm/drop_caches
> ---
>
> Results are below but to me that's all "in the noise" with around one
> percent if I can trust the stddev. Which is not even close to 40%.
>
> So basically you're wasting your time.
>
> 5.15-rc6
> --------
>
> # ./tools/perf/perf stat --repeat 5 --sync --pre=/root/bin/pre-build-kernel.sh -- make -s -j96 bzImage
>
>  Performance counter stats for 'make -s -j96 bzImage' (5 runs):
>
>       3,072,392.92 msec task-clock                #   51.109 CPUs utilized            ( +-  0.05% )
>          1,351,534      context-switches          #  440.257 /sec                     ( +-  0.99% )
>            224,862      cpu-migrations            #   73.248 /sec                     ( +-  1.39% )
>         85,073,723      page-faults               #   27.712 K/sec                    ( +-  0.01% )
>  8,743,357,421,495      cycles                    #    2.848 GHz                      ( +-  0.06% )
>  7,643,946,991,468      instructions              #    0.88  insn per cycle           ( +-  0.00% )
>  1,705,128,638,240      branches                  #  555.440 M/sec                    ( +-  0.00% )
>     37,637,576,027      branch-misses             #    2.21% of all branches          ( +-  0.03% )
> 22,511,903,971,150      slots                     #    7.333 G/sec                    ( +-  0.03% )
>  7,377,211,958,188      topdown-retiring          #     32.5% retiring                ( +-  0.02% )
>  3,145,247,374,138      topdown-bad-spec          #     13.9% bad speculation         ( +-  0.27% )
>  8,018,664,899,041      topdown-fe-bound          #     35.2% frontend bound          ( +-  0.07% )
>  4,167,103,609,622      topdown-be-bound          #     18.3% backend bound           ( +-  0.09% )
>
>             60.114 +- 0.112 seconds time elapsed  ( +-  0.19% )
>
>
>
> 5.15-rc6 + patch
> ----------------
>
>  Performance counter stats for 'make -s -j96 bzImage' (5 runs):
>
>       3,033,250.65 msec task-clock                #   51.243 CPUs utilized            ( +-  0.05% )
>          1,329,033      context-switches          #  438.210 /sec                     ( +-  0.64% )
>            225,550      cpu-migrations            #   74.369 /sec                     ( +-  1.36% )
>         85,080,938      page-faults               #   28.053 K/sec                    ( +-  0.00% )
>  8,629,663,367,477      cycles                    #    2.845 GHz                      ( +-  0.05% )
>  7,696,237,813,803      instructions              #    0.89  insn per cycle           ( +-  0.00% )
>  1,709,909,494,107      branches                  #  563.793 M/sec                    ( +-  0.00% )
>     37,719,552,337      branch-misses             #    2.21% of all branches          ( +-  0.02% )
> 22,214,249,023,820      slots                     #    7.325 G/sec                    ( +-  0.06% )
>  7,412,342,725,008      topdown-retiring          #     33.0% retiring                ( +-  0.01% )
>  3,141,090,408,028      topdown-bad-spec          #     14.1% bad speculation         ( +-  0.17% )
>  7,996,077,873,517      topdown-fe-bound          #     35.6% frontend bound          ( +-  0.03% )
>  3,862,154,886,962      topdown-be-bound          #     17.3% backend bound           ( +-  0.28% )
>
>             59.193 +- 0.302 seconds time elapsed  ( +-  0.51% )

I'm trying to reproduce the difference and noticed that `time` and `perf stat`
might report the real time spent on the job on different scales.

And jiayu.ni's time diff showed the best at 32 jobs and the worst at 96 jobs.

[linux-5.15-rc6]# time make -s bzImage -j96

real    1m8.922s
user    55m25.750s
sys     7m30.666s

[linux-5.15-rc6]# make -s clean

[linux-5.15-rc6]# perf stat  make -s bzImage -j96
..
      61.461679693 seconds time elapsed


    2756.927852000 seconds user
     369.365209000 seconds sys

If the kbuild times that jiayu.ni shared are not solid enough proof for the
optimization idea to be accepted, we can try other clear_page-heavy workloads.

>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-10-18 14:44 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-09  8:45 [PATCH] perf: optimize clear page in Intel specified model with movq instruction Jinhua Wu
2021-09-09  9:39 ` Borislav Petkov
2021-09-09 10:34   ` Luming Yu
2021-09-09 10:44     ` Borislav Petkov
2021-09-09 11:18     ` Peter Zijlstra
     [not found]   ` <bf6fe59d-c760-40d4-8201-4170cd90ffc3.wujinhua@linux.alibaba.com>
2021-09-09 11:07     ` 回复：[PATCH] " Borislav Petkov
2021-09-09 14:51       ` Luming Yu
     [not found]         ` <1cac1499-6b00-3c18-b64c-a22f269a2706@linux.alibaba.com>
2021-10-16 13:22           ` Borislav Petkov
     [not found]             ` <7b07f141-12f5-397d-9e45-1d507cacae84@linux.alibaba.com>
2021-10-18 12:43               ` Borislav Petkov
2021-10-18 14:44                 ` Luming Yu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox