* [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
2019-07-22 21:12 ` Peter Zijlstra
2019-07-22 20:27 ` [PATCH 3/5] x86_64, -march=native: REP MOVSB support Alexey Dobriyan
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
To: tglx, mingo, bp, hpa
Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro,
michal.lkml
Detect POPCNT instruction support and inline hweight*() functions
if it is supported by the CPU.
Detect POPCNT at boot time and conditionally refuse to boot.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
arch/x86/include/asm/segment.h | 1 +
arch/x86/kernel/verify_cpu.S | 8 +++++++
arch/x86/lib/Makefile | 5 +++-
.../drm/i915/display/intel_display_power.c | 2 +-
drivers/misc/sgi-gru/grumain.c | 2 +-
fs/btrfs/tree-checker.c | 4 ++--
include/linux/bitops.h | 2 ++
lib/Makefile | 2 ++
scripts/kconfig/cpuid.c | 7 ++++++
scripts/march-native.sh | 2 ++
11 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index ba88edd0d58b..3797aa57baa5 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,28 @@
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+ return __builtin_popcountll(x);
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+ return __builtin_popcount(x);
+}
+#else
+
#include <asm/cpufeatures.h>
#ifdef CONFIG_64BIT
@@ -53,3 +75,5 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
#endif /* CONFIG_X86_32 */
#endif
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index ac3892920419..d314c6b9b632 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@
#include <linux/const.h>
#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
/*
* Constructor for a conventional segment GDT (or LDT) entry.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a024c4f7ba56..a9be8904faa3 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -134,6 +134,14 @@ ENTRY(verify_cpu)
movl $1,%eax
ret
.Lverify_cpu_sse_ok:
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+ mov $1, %eax
+ cpuid
+ bt $23, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..7dc0e71b0ef3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -40,7 +40,10 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RETPOLINE) += retpoline.o
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
+ obj-y += hweight.o
+endif
obj-y += iomem.o
ifeq ($(CONFIG_X86_32),y)
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index c93ad512014c..9066105f2fea 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -1570,7 +1570,7 @@ static void print_power_domains(struct i915_power_domains *power_domains,
{
enum intel_display_power_domain domain;
- DRM_DEBUG_DRIVER("%s (%lu):\n", prefix, hweight64(mask));
+ DRM_DEBUG_DRIVER("%s (%u):\n", prefix, hweight64(mask));
for_each_power_domain(domain, mask)
DRM_DEBUG_DRIVER("%s use_count %d\n",
intel_display_power_domain_str(domain),
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 40ac59dd018c..30cfeeb28e74 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -833,7 +833,7 @@ void gru_steal_context(struct gru_thread_state *gts)
}
gru_dbg(grudev,
"stole gid %d, ctxnum %d from gts %p. Need cb %d, ds %d;"
- " avail cb %ld, ds %ld\n",
+ " avail cb %u, ds %u\n",
gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
hweight64(gru->gs_dsr_map));
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ccd5706199d7..2d33c6ae0e61 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -478,7 +478,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
flags = btrfs_block_group_flags(&bgi);
if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
block_group_err(leaf, slot,
-"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+"invalid profile flags, have 0x%llx (%u bits set) expect no more than 1 bit set",
flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
return -EUCLEAN;
@@ -491,7 +491,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)) {
block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%u bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf074bce3eb3..655b120bba66 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -7,10 +7,12 @@
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);
+#endif
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/lib/Makefile b/lib/Makefile
index 095601ce371d..32400f3a3328 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -114,7 +114,9 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
obj-y += logic_pio.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 81b292382e26..9efc0d9464d8 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -43,6 +43,8 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
);
}
+static bool popcnt = false;
+
static uint32_t eax0_max;
static void intel(void)
@@ -52,6 +54,10 @@ static void intel(void)
if (eax0_max >= 1) {
cpuid(1, &eax, &ecx, &edx, &ebx);
// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ecx & (1 << 23)) {
+ popcnt = true;
+ }
}
}
@@ -72,6 +78,7 @@ int main(int argc, char *argv[])
}
#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+ _(popcnt);
#undef _
return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 29a33c80b62b..c3059f93ed2b 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -41,6 +41,8 @@ COLLECT_GCC_OPTIONS=$(
)
echo "-march=native: $COLLECT_GCC_OPTIONS"
+"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
+
for i in $COLLECT_GCC_OPTIONS; do
case $i in
*/cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
--
2.21.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
@ 2019-07-22 21:12 ` Peter Zijlstra
2019-07-22 21:15 ` Alexey Dobriyan
0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2019-07-22 21:12 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
yamada.masahiro, michal.lkml
On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> Detect POPCNT instruction support and inline hweigth*() functions
> if it is supported by CPU.
>
> Detect POPCNT at boot time and conditionally refuse to boot.
>
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
> arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
> arch/x86/include/asm/segment.h | 1 +
> arch/x86/kernel/verify_cpu.S | 8 +++++++
> arch/x86/lib/Makefile | 5 +++-
> .../drm/i915/display/intel_display_power.c | 2 +-
> drivers/misc/sgi-gru/grumain.c | 2 +-
> fs/btrfs/tree-checker.c | 4 ++--
> include/linux/bitops.h | 2 ++
> lib/Makefile | 2 ++
> scripts/kconfig/cpuid.c | 7 ++++++
> scripts/march-native.sh | 2 ++
> 11 files changed, 54 insertions(+), 5 deletions(-)
*WHY* ?
AFAICT this just adds lines and complexity and wins absolutely nothing.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-22 21:12 ` Peter Zijlstra
@ 2019-07-22 21:15 ` Alexey Dobriyan
2019-07-22 21:27 ` Alexey Dobriyan
2019-07-23 7:20 ` Peter Zijlstra
0 siblings, 2 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 21:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
yamada.masahiro, michal.lkml
On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > Detect POPCNT instruction support and inline hweigth*() functions
> > if it is supported by CPU.
> >
> > Detect POPCNT at boot time and conditionally refuse to boot.
> >
> > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > ---
> > arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
> > arch/x86/include/asm/segment.h | 1 +
> > arch/x86/kernel/verify_cpu.S | 8 +++++++
> > arch/x86/lib/Makefile | 5 +++-
> > .../drm/i915/display/intel_display_power.c | 2 +-
> > drivers/misc/sgi-gru/grumain.c | 2 +-
> > fs/btrfs/tree-checker.c | 4 ++--
> > include/linux/bitops.h | 2 ++
> > lib/Makefile | 2 ++
> > scripts/kconfig/cpuid.c | 7 ++++++
> > scripts/march-native.sh | 2 ++
> > 11 files changed, 54 insertions(+), 5 deletions(-)
>
> *WHY* ?
>
> AFAICT this just adds lines and complexity and wins aboslutely nothing.
If the CPU is known to have POPCNT, it doesn't make sense to go through RDI.
Additionally some CPUs (still?) have a fake dependency on the destination,
so "popcnt rax, rdi" is suboptimal.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-22 21:15 ` Alexey Dobriyan
@ 2019-07-22 21:27 ` Alexey Dobriyan
2019-07-23 7:20 ` Peter Zijlstra
1 sibling, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 21:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
yamada.masahiro, michal.lkml
On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweigth*() functions
> > > if it is supported by CPU.
> > >
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > >
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > > arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
> > > arch/x86/include/asm/segment.h | 1 +
> > > arch/x86/kernel/verify_cpu.S | 8 +++++++
> > > arch/x86/lib/Makefile | 5 +++-
> > > .../drm/i915/display/intel_display_power.c | 2 +-
> > > drivers/misc/sgi-gru/grumain.c | 2 +-
> > > fs/btrfs/tree-checker.c | 4 ++--
> > > include/linux/bitops.h | 2 ++
> > > lib/Makefile | 2 ++
> > > scripts/kconfig/cpuid.c | 7 ++++++
> > > scripts/march-native.sh | 2 ++
> > > 11 files changed, 54 insertions(+), 5 deletions(-)
> >
> > *WHY* ?
> >
> > AFAICT this just adds lines and complexity and wins aboslutely nothing.
>
> If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.
A more general argument is that if -march=native is accepted, the compiler
will generate new instructions which will throw #UD on CPUs which aren't
capable, so it doesn't make sense to _not_ go deeper and use all the
knowledge about the current CPU.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-22 21:15 ` Alexey Dobriyan
2019-07-22 21:27 ` Alexey Dobriyan
@ 2019-07-23 7:20 ` Peter Zijlstra
2019-07-23 20:04 ` Alexey Dobriyan
1 sibling, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2019-07-23 7:20 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
yamada.masahiro, michal.lkml
On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweigth*() functions
> > > if it is supported by CPU.
> > >
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > >
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > > arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
> > > arch/x86/include/asm/segment.h | 1 +
> > > arch/x86/kernel/verify_cpu.S | 8 +++++++
> > > arch/x86/lib/Makefile | 5 +++-
> > > .../drm/i915/display/intel_display_power.c | 2 +-
> > > drivers/misc/sgi-gru/grumain.c | 2 +-
> > > fs/btrfs/tree-checker.c | 4 ++--
> > > include/linux/bitops.h | 2 ++
> > > lib/Makefile | 2 ++
> > > scripts/kconfig/cpuid.c | 7 ++++++
> > > scripts/march-native.sh | 2 ++
> > > 11 files changed, 54 insertions(+), 5 deletions(-)
> >
> > *WHY* ?
> >
> > AFAICT this just adds lines and complexity and wins aboslutely nothing.
>
> If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.
You completely forgot to mention any of that in your Changelog, also I
doubt you can find code where this makes a measurable difference. IOW, I
still doubt it makes any kind of sense.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
2019-07-23 7:20 ` Peter Zijlstra
@ 2019-07-23 20:04 ` Alexey Dobriyan
0 siblings, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-23 20:04 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
yamada.masahiro, michal.lkml
On Tue, Jul 23, 2019 at 09:20:43AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> > On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > > Detect POPCNT instruction support and inline hweigth*() functions
> > > > if it is supported by CPU.
> > > >
> > > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > >
> > > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > > ---
> > > > arch/x86/include/asm/arch_hweight.h | 24 +++++++++++++++++++
> > > > arch/x86/include/asm/segment.h | 1 +
> > > > arch/x86/kernel/verify_cpu.S | 8 +++++++
> > > > arch/x86/lib/Makefile | 5 +++-
> > > > .../drm/i915/display/intel_display_power.c | 2 +-
> > > > drivers/misc/sgi-gru/grumain.c | 2 +-
> > > > fs/btrfs/tree-checker.c | 4 ++--
> > > > include/linux/bitops.h | 2 ++
> > > > lib/Makefile | 2 ++
> > > > scripts/kconfig/cpuid.c | 7 ++++++
> > > > scripts/march-native.sh | 2 ++
> > > > 11 files changed, 54 insertions(+), 5 deletions(-)
> > >
> > > *WHY* ?
> > >
> > > AFAICT this just adds lines and complexity and wins aboslutely nothing.
> >
> > If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> > Additionally some CPUs (still?) have fake dependency on the destination,
> > so "popcnt rax, rdi" is suboptimal.
>
> You completely forgot to mention any of that in your Changelog, also I
> doubt you can find code where this makes a measurable difference. IOW, I
> still doubt it makes any kind of sense.
It saves some space, although not much. gcc likes to use the 64-bit version
even where the 32-bit version would suffice.
Regardless I found some problems with POPCNT patch, so hold off the
series.
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 3/5] x86_64, -march=native: REP MOVSB support
2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 5/5] x86_64, -march=native: MOVBE support Alexey Dobriyan
3 siblings, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
To: tglx, mingo, bp, hpa
Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro,
michal.lkml
Detect fast REP MOVSB support and use it for page copying.
Inline copy_page(), this saves alternative entry and a function call
overhead which should hopefully improve code generation.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
Makefile | 3 +++
arch/x86/include/asm/page_64.h | 13 +++++++++++++
arch/x86/kernel/relocate_kernel_64.S | 15 +++++++++++++++
arch/x86/kernel/verify_cpu.S | 12 ++++++++++++
arch/x86/lib/Makefile | 5 ++++-
arch/x86/lib/memcpy_64.S | 13 +++++++++++++
arch/x86/platform/pvh/head.S | 4 ++++
scripts/kconfig/cpuid.c | 9 +++++++++
scripts/march-native.sh | 1 +
9 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index 690f70afa74e..aa194c96d27c 100644
--- a/Makefile
+++ b/Makefile
@@ -609,6 +609,9 @@ endif
ifdef CONFIG_MARCH_NATIVE
KBUILD_CFLAGS += -march=native
endif
+ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
ifeq ($(KBUILD_EXTMOD),)
# Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..051da768273d 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -54,7 +54,20 @@ static inline void clear_page(void *page)
: "cc", "memory", "rax", "rcx");
}
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+static __always_inline void copy_page(void *to, void *from)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep movsb"
+ : "+D" (to), "+S" (from), "+c" (len)
+ :
+ : "memory"
+ );
+}
+#else
void copy_page(void *to, void *from);
+#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..822f7a3d035a 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -266,18 +266,33 @@ swap_pages:
movq %rsi, %rax
movq %r10, %rdi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rax, %rdi
movq %rdx, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rdx, %rdi
movq %r10, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
lea PAGE_SIZE(%rax), %rsi
jmp 0b
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a9be8904faa3..57b41dafc592 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,6 +142,18 @@ ENTRY(verify_cpu)
jnc .Lverify_cpu_no_longmode
#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ xor %eax, %eax
+ cpuid
+ cmp $7, %eax
+ jb .Lverify_cpu_no_longmode
+ mov $7, %eax
+ xor %ecx, %ecx
+ cpuid
+ bt $9, %ebx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7dc0e71b0ef3..fa24cc717fb1 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,10 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += clear_page_64.o copy_page_64.o
+ lib-y += clear_page_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
+ lib-y += copy_page_64.o
+endif
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 92748660ba51..ab5b9662b348 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -17,6 +17,18 @@
.weak memcpy
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ mov %rdi, %rax
+ mov %rdx, %rcx
+ rep movsb
+ ret
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+#else
/*
* memcpy - Copy a memory block.
*
@@ -183,6 +195,7 @@ ENTRY(memcpy_orig)
.Lend:
retq
ENDPROC(memcpy_orig)
+#endif
#ifndef CONFIG_UML
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 1f8825bbaffb..2737f3e8c021 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -64,9 +64,13 @@ ENTRY(pvh_start_xen)
mov $_pa(pvh_start_info), %edi
mov %ebx, %esi
mov _pa(pvh_start_info_sz), %ecx
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ rep movsb
+#else
shr $2,%ecx
rep
movsl
+#endif
mov $_pa(early_stack_end), %esp
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 9efc0d9464d8..2d78fba1dcc7 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -44,6 +44,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
}
static bool popcnt = false;
+static bool rep_movsb = false;
static uint32_t eax0_max;
@@ -59,6 +60,13 @@ static void intel(void)
popcnt = true;
}
}
+ if (eax0_max >= 7) {
+ cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ebx & (1 << 9))
+ rep_movsb = true;
+ }
}
int main(int argc, char *argv[])
@@ -79,6 +87,7 @@ int main(int argc, char *argv[])
#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
_(popcnt);
+ _(rep_movsb);
#undef _
return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index c3059f93ed2b..87f00cdb8e10 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -42,6 +42,7 @@ COLLECT_GCC_OPTIONS=$(
echo "-march=native: $COLLECT_GCC_OPTIONS"
"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
+"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
for i in $COLLECT_GCC_OPTIONS; do
case $i in
--
2.21.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 4/5] x86_64, -march=native: REP STOSB support
2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 3/5] x86_64, -march=native: REP MOVSB support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 5/5] x86_64, -march=native: MOVBE support Alexey Dobriyan
3 siblings, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
To: tglx, mingo, bp, hpa
Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro,
michal.lkml
Use REP STOSB everywhere if CPU advertises fast REP STOSB.
Gcc LOVES to unroll memset(), using -mmemset-strategy saves terabytes of
.text.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
Makefile | 3 +++
arch/x86/boot/compressed/head_64.S | 4 ++++
arch/x86/crypto/sha1_ssse3_asm.S | 7 ++++++-
arch/x86/include/asm/page_64.h | 13 +++++++++++++
arch/x86/kernel/verify_cpu.S | 2 +-
arch/x86/lib/Makefile | 2 ++
arch/x86/lib/memset_64.S | 15 +++++++++++++++
arch/x86/lib/usercopy_64.c | 16 +++++++++++++++-
scripts/kconfig/cpuid.c | 6 +++++-
scripts/march-native.sh | 1 +
10 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/Makefile b/Makefile
index aa194c96d27c..31a6375d0e31 100644
--- a/Makefile
+++ b/Makefile
@@ -612,6 +612,9 @@ endif
ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
ifeq ($(KBUILD_EXTMOD),)
# Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6233ae35d0d9..a350d265e8af 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -520,8 +520,12 @@ relocated:
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ rep stosb
+#else
shrq $3, %rcx
rep stosq
+#endif
/*
* Do the extraction, and jump to the new kernel..
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 99c5b8c4dc38..c98f8f2aead6 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -90,10 +90,15 @@
SHA1_PIPELINED_MAIN_BODY
# cleanup workspace
- mov $8, %ecx
mov %rsp, %rdi
xor %eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ mov $64, %ecx
+ rep stosb
+#else
+ mov $8, %ecx
rep stosq
+#endif
mov %rbp, %rsp # deallocate workspace
pop %rbp
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 051da768273d..7654d5544e0b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep stosb"
+ : "+D" (page), "+c" (len)
+ : "a" (0)
+ : "memory"
+ );
+}
+#else
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
@@ -53,6 +65,7 @@ static inline void clear_page(void *page)
"0" (page)
: "cc", "memory", "rax", "rcx");
}
+#endif
#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
static __always_inline void copy_page(void *to, void *from)
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 57b41dafc592..d3f3370e7dab 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,7 +142,7 @@ ENTRY(verify_cpu)
jnc .Lverify_cpu_no_longmode
#endif
-#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
xor %eax, %eax
cpuid
cmp $7, %eax
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index fa24cc717fb1..ed71e88cb859 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,9 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
lib-y += clear_page_64.o
+endif
ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
lib-y += copy_page_64.o
endif
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
.weak memset
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+ mov %esi, %eax
+ mov %rdi, %rsi
+ mov %rdx, %rcx
+ rep stosb
+ mov %rsi, %rax
+ ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index fff28c6f73a2..a90779b12d89 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -16,11 +16,23 @@
unsigned long __clear_user(void __user *addr, unsigned long size)
{
- long __d0;
might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ asm volatile (
+ "0: rep stosb\n"
+ "1:\n"
+ _ASM_EXTABLE(0b,1b)
+ : "+D" (addr), "+c" (size)
+ : "a" (0)
+ : "memory"
+ );
+#else
+ {
+ long __d0;
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
@@ -42,6 +54,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
_ASM_EXTABLE_UA(1b, 2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+ }
+#endif
clac();
return size;
}
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 2d78fba1dcc7..58d09bda61e5 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -45,6 +45,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
static bool popcnt = false;
static bool rep_movsb = false;
+static bool rep_stosb = false;
static uint32_t eax0_max;
@@ -64,8 +65,10 @@ static void intel(void)
cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
- if (ebx & (1 << 9))
+ if (ebx & (1 << 9)) {
rep_movsb = true;
+ rep_stosb = true;
+ }
}
}
@@ -88,6 +91,7 @@ int main(int argc, char *argv[])
#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
_(popcnt);
_(rep_movsb);
+ _(rep_stosb);
#undef _
return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 87f00cdb8e10..a41a15a64df4 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -43,6 +43,7 @@ echo "-march=native: $COLLECT_GCC_OPTIONS"
"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
for i in $COLLECT_GCC_OPTIONS; do
case $i in
--
2.21.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 5/5] x86_64, -march=native: MOVBE support
2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
` (2 preceding siblings ...)
2019-07-22 20:27 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
3 siblings, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
To: tglx, mingo, bp, hpa
Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro,
michal.lkml
Use MOVBE if it is available.
Internally MOVBE probably translates to MOV+BSWAP anyway, but who knows.
Do it because it is easy to do...
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
arch/x86/crypto/des3_ede-asm_64.S | 28 ++++++++++++++++++++++++++++
arch/x86/kernel/verify_cpu.S | 7 +++++++
scripts/kconfig/cpuid.c | 5 +++++
scripts/march-native.sh | 1 +
4 files changed, 41 insertions(+)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 7fca43099a5f..2fd310e98b0b 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -150,6 +150,15 @@
#define dummy2(a, b) /*_*/
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+#define read_block(io, left, right) \
+ movbe (io), left##d; \
+ movbe 4(io), right##d;
+
+#define write_block(io, left, right) \
+ movbe left##d, (io); \
+ movbe right##d, 4(io);
+#else
#define read_block(io, left, right) \
movl (io), left##d; \
movl 4(io), right##d; \
@@ -161,6 +170,7 @@
bswapl right##d; \
movl left##d, (io); \
movl right##d, 4(io);
+#endif
ENTRY(des3_ede_x86_64_crypt_blk)
/* input:
@@ -434,6 +444,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
pushq %rsi /* dst */
/* load input */
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe 0 * 4(%rdx), RL0d;
+ movbe 1 * 4(%rdx), RR0d;
+ movbe 2 * 4(%rdx), RL1d;
+ movbe 3 * 4(%rdx), RR1d;
+ movbe 4 * 4(%rdx), RL2d;
+ movbe 5 * 4(%rdx), RR2d;
+#else
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
movl 2 * 4(%rdx), RL1d;
@@ -447,6 +465,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR1d;
bswapl RL2d;
bswapl RR2d;
+#endif
initial_permutation3(RL, RR);
@@ -507,6 +526,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
final_permutation3(RR, RL);
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe RR0d, 0 * 4(%rsi);
+ movbe RL0d, 1 * 4(%rsi);
+ movbe RR1d, 2 * 4(%rsi);
+ movbe RL1d, 3 * 4(%rsi);
+ movbe RR2d, 4 * 4(%rsi);
+ movbe RL2d, 5 * 4(%rsi);
+#else
bswapl RR0d;
bswapl RL0d;
bswapl RR1d;
@@ -521,6 +548,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
movl RL1d, 3 * 4(%rsi);
movl RR2d, 4 * 4(%rsi);
movl RL2d, 5 * 4(%rsi);
+#endif
popq %r15;
popq %r14;
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index d3f3370e7dab..f8ff130edfb3 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,6 +142,13 @@ ENTRY(verify_cpu)
jnc .Lverify_cpu_no_longmode
#endif
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ mov $1, %eax
+ cpuid
+ bt $22, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
xor %eax, %eax
cpuid
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 58d09bda61e5..0da1142a59da 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -43,6 +43,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
);
}
+static bool movbe = false;
static bool popcnt = false;
static bool rep_movsb = false;
static bool rep_stosb = false;
@@ -57,6 +58,9 @@ static void intel(void)
cpuid(1, &eax, &ecx, &edx, &ebx);
// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+ if (ecx & (1 << 22)) {
+ movbe = true;
+ }
if (ecx & (1 << 23)) {
popcnt = true;
}
@@ -89,6 +93,7 @@ int main(int argc, char *argv[])
}
#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+ _(movbe);
_(popcnt);
_(rep_movsb);
_(rep_stosb);
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index a41a15a64df4..530bac22fa07 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -41,6 +41,7 @@ COLLECT_GCC_OPTIONS=$(
)
echo "-march=native: $COLLECT_GCC_OPTIONS"
+"$CPUID" movbe && option "CONFIG_MARCH_NATIVE_MOVBE"
"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
--
2.21.0
^ permalink raw reply related [flat|nested] 10+ messages in thread