From: Alexey Dobriyan <adobriyan@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: torvalds@linux-foundation.org
Subject: Linux 5.0-ad1: -march=native support
Date: Mon, 4 Mar 2019 20:19:16 +0300 [thread overview]
Message-ID: <20190304171916.GA1694@avx2> (raw)
-ad1 patchset adds support for compiling kernel with "-march=native"
compiler option optimizing kernel for the specific CPU. "-march=native"
has been available in userspace for a long time and is trivial to enable
in Gentoo specifically.
"-march=native" can be used for folks like me who compile kernels on
their home machines and never share binaries.
See the link for more information:
https://www.shlomifish.org/humour/by-others/funroll-loops/Gentoo-is-Rice.html
Requirements:
Intel CPU
x86_64 arch
Usage:
# apply -ad1 patchset
# copy regular kernel .config
# enable "-march=native" support in
# "Processor type and features"
# "Processor family"
# ensure CONFIG_MARCH_NATIVE is enabled
$ grep -e CONFIG_MARCH_NATIVE .config
# workaround kbuild race condition if "-j" is used
$ make syncconfig
# build the kernel
$ make ...
# install kernel, reboot into new kernel
# verify detected CONFIG_MARCH_NATIVE_* options
$ gzip -d </proc/config.gz | grep -e CONFIG_MARCH_NATIVE
Ping me over email of something breaks or horribly slow.
Original announcement:
https://marc.info/?l=linux-kernel&m=151268659328024&w=4
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
Makefile | 16 +++
arch/x86/Kconfig.cpu | 8 +
arch/x86/Makefile | 27 +++++-
arch/x86/boot/compressed/head_64.S | 4
arch/x86/crypto/des3_ede-asm_64.S | 28 ++++++
arch/x86/crypto/sha1_ssse3_asm.S | 7 +
arch/x86/include/asm/arch_hweight.h | 28 ++++++
arch/x86/include/asm/page_64.h | 26 ++++++
arch/x86/include/asm/segment.h | 1
arch/x86/kernel/relocate_kernel_64.S | 15 +++
arch/x86/kernel/verify_cpu.S | 27 ++++++
arch/x86/lib/Makefile | 12 ++
arch/x86/lib/memcpy_64.S | 13 +++
arch/x86/lib/memset_64.S | 15 +++
arch/x86/lib/usercopy_64.c | 16 +++
arch/x86/platform/pvh/head.S | 4
drivers/net/wireless/mediatek/mt76/mac80211.c | 2
include/linux/bitops.h | 2
lib/Makefile | 2
scripts/kconfig/.gitignore | 1
scripts/kconfig/Makefile | 7 +
scripts/kconfig/cpuid.c | 108 ++++++++++++++++++++++++++
scripts/march-native.sh | 74 +++++++++++++++++
23 files changed, 429 insertions(+), 14 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 5
PATCHLEVEL = 0
SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -ad1
NAME = Shy Crocodile
# *DOCUMENTATION*
@@ -370,10 +370,10 @@ HOST_LFS_LIBS := $(shell getconf LFS_LIBS 2>/dev/null)
HOSTCC = gcc
HOSTCXX = g++
-KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \
+KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -march=native -O2 \
-fomit-frame-pointer -std=gnu89 $(HOST_LFS_CFLAGS) \
$(HOSTCFLAGS)
-KBUILD_HOSTCXXFLAGS := -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
+KBUILD_HOSTCXXFLAGS := -march=native -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
@@ -594,6 +594,16 @@ ifeq ($(dot-config),1)
include include/config/auto.conf
endif
+ifdef CONFIG_MARCH_NATIVE
+KBUILD_CFLAGS += -march=native
+endif
+ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
+
# The all: target is the default when no target is given on the
# command line.
# This allow a user to issue only 'make' to build a kernel including modules
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -287,6 +287,12 @@ config GENERIC_CPU
Generic x86-64 CPU.
Run equally well on all x86-64 CPUs.
+config MARCH_NATIVE
+ bool "-march=native"
+ depends on X86_64
+ ---help---
+ -march=native support.
+
endchoice
config X86_GENERIC
@@ -307,6 +313,7 @@ config X86_INTERNODE_CACHE_SHIFT
int
default "12" if X86_VSMP
default X86_L1_CACHE_SHIFT
+ depends on !MARCH_NATIVE
config X86_L1_CACHE_SHIFT
int
@@ -314,6 +321,7 @@ config X86_L1_CACHE_SHIFT
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if MELAN || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+ depends on !MARCH_NATIVE
config X86_F00F_BUG
def_bool y
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -12,6 +12,28 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+CFLAGS_NO_FP :=
+CFLAGS_NO_FP += $(call cc-option,-mno-mmx,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse3,)
+CFLAGS_NO_FP += $(call cc-option,-mno-ssse3,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4a,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4.1,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4.2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512f,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512pf,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512er,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512cd,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512vl,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512bw,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512dq,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512ifma,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512vbmi,)
+
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
@@ -34,7 +56,7 @@ M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
- -mno-mmx -mno-sse
+ $(CFLAGS_NO_FP)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
@@ -57,8 +79,7 @@ endif
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+KBUILD_CFLAGS += $(CFLAGS_NO_FP)
ifeq ($(CONFIG_X86_32),y)
BITS := 32
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -517,8 +517,12 @@ relocated:
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ rep stosb
+#else
shrq $3, %rcx
rep stosq
+#endif
/*
* Do the extraction, and jump to the new kernel..
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -159,6 +159,15 @@
#define dummy2(a, b) /*_*/
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+#define read_block(io, left, right) \
+ movbe (io), left##d; \
+ movbe 4(io), right##d;
+
+#define write_block(io, left, right) \
+ movbe left##d, (io); \
+ movbe right##d, 4(io);
+#else
#define read_block(io, left, right) \
movl (io), left##d; \
movl 4(io), right##d; \
@@ -170,6 +179,7 @@
bswapl right##d; \
movl left##d, (io); \
movl right##d, 4(io);
+#endif
ENTRY(des3_ede_x86_64_crypt_blk)
/* input:
@@ -443,6 +453,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
pushq %rsi /* dst */
/* load input */
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe 0 * 4(%rdx), RL0d;
+ movbe 1 * 4(%rdx), RR0d;
+ movbe 2 * 4(%rdx), RL1d;
+ movbe 3 * 4(%rdx), RR1d;
+ movbe 4 * 4(%rdx), RL2d;
+ movbe 5 * 4(%rdx), RR2d;
+#else
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
movl 2 * 4(%rdx), RL1d;
@@ -456,6 +474,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR1d;
bswapl RL2d;
bswapl RR2d;
+#endif
initial_permutation3(RL, RR);
@@ -516,6 +535,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
final_permutation3(RR, RL);
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe RR0d, 0 * 4(%rsi);
+ movbe RL0d, 1 * 4(%rsi);
+ movbe RR1d, 2 * 4(%rsi);
+ movbe RL1d, 3 * 4(%rsi);
+ movbe RR2d, 4 * 4(%rsi);
+ movbe RL2d, 5 * 4(%rsi);
+#else
bswapl RR0d;
bswapl RL0d;
bswapl RR1d;
@@ -530,6 +557,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
movl RL1d, 3 * 4(%rsi);
movl RR2d, 4 * 4(%rsi);
movl RL2d, 5 * 4(%rsi);
+#endif
popq %r15;
popq %r14;
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -94,10 +94,15 @@
SHA1_PIPELINED_MAIN_BODY
# cleanup workspace
- mov $8, %ecx
mov %rsp, %rdi
xor %eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ mov $64, %ecx
+ rep stosb
+#else
+ mov $8, %ecx
rep stosq
+#endif
mov %rbp, %rsp # deallocate workspace
pop %rbp
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,30 @@
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H
+#define __HAVE_ARCH_SW_HWEIGHT
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+ return __builtin_popcountll(x);
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+ return __builtin_popcount(x);
+}
+#else
+
#include <asm/cpufeatures.h>
#ifdef CONFIG_64BIT
@@ -12,8 +36,6 @@
#define REG_OUT "a"
#endif
-#define __HAVE_ARCH_SW_HWEIGHT
-
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res;
@@ -55,3 +77,5 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
#endif /* CONFIG_X86_32 */
#endif
+
+#endif
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep stosb"
+ : "+D" (page), "+c" (len)
+ : "a" (0)
+ : "memory"
+ );
+}
+#else
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
@@ -53,8 +65,22 @@ static inline void clear_page(void *page)
"0" (page)
: "cc", "memory", "rax", "rcx");
}
+#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+static __always_inline void copy_page(void *to, void *from)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep movsb"
+ : "+D" (to), "+S" (from), "+c" (len)
+ :
+ : "memory"
+ );
+}
+#else
void copy_page(void *to, void *from);
+#endif
#endif /* !__ASSEMBLY__ */
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@
#include <linux/const.h>
#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
/*
* Constructor for a conventional segment GDT (or LDT) entry.
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -268,18 +268,33 @@ swap_pages:
movq %rsi, %rax
movq %r10, %rdi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rax, %rdi
movq %rdx, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rdx, %rdi
movq %r10, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
lea PAGE_SIZE(%rax), %rsi
jmp 0b
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -136,6 +136,33 @@ ENTRY(verify_cpu)
movl $1,%eax
ret
.Lverify_cpu_sse_ok:
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+ mov $1, %eax
+ cpuid
+ bt $23, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ mov $1, %eax
+ cpuid
+ bt $22, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
+ xor %eax, %eax
+ cpuid
+ cmp $7, %eax
+ jb .Lverify_cpu_no_longmode
+ mov $7, %eax
+ xor %ecx, %ecx
+ cpuid
+ bt $9, %ebx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
popf # Restore caller passed flags
xorl %eax, %eax
ret
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -29,7 +29,10 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RETPOLINE) += retpoline.o
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
+ obj-y += hweight.o
+endif
obj-y += iomem.o
ifeq ($(CONFIG_X86_32),y)
@@ -45,7 +48,12 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += clear_page_64.o copy_page_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
+ lib-y += clear_page_64.o
+endif
+ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
+ lib-y += copy_page_64.o
+endif
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
lib-y += cmpxchg16b_emu.o
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -16,6 +16,18 @@
.weak memcpy
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ mov %rdi, %rax
+ mov %rdx, %rcx
+ rep movsb
+ ret
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+#else
/*
* memcpy - Copy a memory block.
*
@@ -182,6 +194,7 @@ ENTRY(memcpy_orig)
.Lend:
retq
ENDPROC(memcpy_orig)
+#endif
#ifndef CONFIG_UML
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
.weak memset
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+ mov %esi, %eax
+ mov %rdi, %rsi
+ mov %rdx, %rcx
+ rep stosb
+ mov %rsi, %rax
+ ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
+#endif
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,11 +15,23 @@
unsigned long __clear_user(void __user *addr, unsigned long size)
{
- long __d0;
might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ asm volatile (
+ "0: rep stosb\n"
+ "1:\n"
+ _ASM_EXTABLE(0b,1b)
+ : "+D" (addr), "+c" (size)
+ : "a" (0)
+ : "memory"
+ );
+#else
+ {
+ long __d0;
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
@@ -41,6 +53,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
_ASM_EXTABLE_UA(1b, 2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+ }
+#endif
clac();
return size;
}
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -64,9 +64,13 @@ ENTRY(pvh_start_xen)
mov $_pa(pvh_start_info), %edi
mov %ebx, %esi
mov _pa(pvh_start_info_sz), %ecx
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ rep movsb
+#else
shr $2,%ecx
rep
movsl
+#endif
mov $_pa(early_stack_end), %esp
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -124,7 +124,7 @@ static void mt76_init_stream_cap(struct mt76_dev *dev,
bool vht)
{
struct ieee80211_sta_ht_cap *ht_cap = &sband->ht_cap;
- int i, nstream = __sw_hweight8(dev->antenna_mask);
+ int i, nstream = hweight8(dev->antenna_mask);
struct ieee80211_sta_vht_cap *vht_cap;
u16 mcs_map = 0;
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -7,10 +7,12 @@
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);
+#endif
/*
* Include this here because some architectures need generic_ffs/fls in
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -93,7 +93,9 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
obj-y += logic_pio.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
--- a/scripts/kconfig/.gitignore
+++ b/scripts/kconfig/.gitignore
@@ -8,6 +8,7 @@
# configuration programs
#
conf
+cpuid
mconf
nconf
qconf
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -65,8 +65,9 @@ simple-targets := oldconfig allnoconfig allyesconfig allmodconfig \
alldefconfig randconfig listnewconfig olddefconfig syncconfig
PHONY += $(simple-targets)
-$(simple-targets): $(obj)/conf
+$(simple-targets): $(obj)/conf $(obj)/cpuid
$< $(silent) --$@ $(Kconfig)
+ $(Q)$(srctree)/scripts/march-native.sh $(CC) $(obj)/cpuid
PHONY += savedefconfig defconfig
@@ -149,6 +150,10 @@ $(obj)/zconf.lex.o: $(obj)/zconf.tab.h
HOSTCFLAGS_zconf.lex.o := -I$(src)
HOSTCFLAGS_zconf.tab.o := -I$(src)
+# cpuid: -march=native, CONFIG_MARCH_NATIVE_* detection
+hostprogs-y += cpuid
+cpuid-objs := cpuid.o
+
# conf: Used for defconfig, oldconfig and related targets
hostprogs-y += conf
conf-objs := conf.o $(common-objs)
new file mode 100644
--- /dev/null
+++ b/scripts/kconfig/cpuid.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef __x86_64__
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static inline bool streq(const char *s1, const char *s2)
+{
+ return strcmp(s1, s2) == 0;
+}
+
+static inline void cpuid(uint32_t eax0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+ asm volatile (
+ "cpuid"
+ : "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+ : "0" (eax0)
+ );
+}
+
+static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+ asm volatile (
+ "cpuid"
+ : "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+ : "0" (eax0), "1" (ecx0)
+ );
+}
+
+static bool movbe = false;
+static bool popcnt = false;
+static bool rep_movsb = false;
+static bool rep_stosb = false;
+
+static uint32_t eax0_max;
+
+static void intel(void)
+{
+ uint32_t eax, ecx, edx, ebx;
+
+ if (eax0_max >= 1) {
+ cpuid(1, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ecx & (1 << 22))
+ movbe = true;
+ if (ecx & (1 << 23))
+ popcnt = true;
+ }
+ if (eax0_max >= 7) {
+ cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ebx & (1 << 9)) {
+ rep_movsb = true;
+ rep_stosb = true;
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ const char *opt = argv[1];
+ uint32_t eax, ecx, edx, ebx;
+
+ if (argc != 2)
+ return EXIT_FAILURE;
+
+ cpuid(0, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+ eax0_max = eax;
+
+ if (ecx == 0x6c65746e && edx == 0x49656e69 && ebx == 0x756e6547) {
+ intel();
+ }
+
+#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+ _(movbe);
+ _(popcnt);
+ _(rep_movsb);
+ _(rep_stosb);
+#undef _
+
+ return EXIT_FAILURE;
+}
+#else
+#include <stdlib.h>
+int main(void)
+{
+ return EXIT_FAILURE;
+}
+#endif
new file mode 100755
--- /dev/null
+++ b/scripts/march-native.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# Copyright (c) 2017-2019 Alexey Dobriyan <adobriyan@gmail.com>
+if test "$(uname -m)" != "x86_64"; then
+ exit 0
+fi
+
+CC="$1"
+CPUID="$2"
+CONFIG=".config"
+AUTOCONF1="include/config/auto.conf"
+AUTOCONF2="include/generated/autoconf.h"
+
+if ! grep -q -e '^CONFIG_MARCH_NATIVE=y$' "$CONFIG"; then
+ sed -i -e '/^CONFIG_MARCH_NATIVE/d' "$AUTOCONF1" "$AUTOCONF2" >/dev/null 2>&1
+ exit 0
+fi
+
+if ! "$CC" -march=native -x c -c -o /dev/null /dev/null >/dev/null 2>&1; then
+ echo >&2 "error: unsupported '-march=native' compiler option"
+ exit 1
+fi
+
+_option() {
+ echo "$1=$2" >>"$CONFIG"
+ echo "$1=$2" >>"$AUTOCONF1"
+ echo "#define $1 $2" >>"$AUTOCONF2"
+}
+
+option() {
+ echo "$1=y" >>"$CONFIG"
+ echo "$1=y" >>"$AUTOCONF1"
+ echo "#define $1 1" >>"$AUTOCONF2"
+}
+
+if test ! -f "$CONFIG" -o ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
+ exit 0
+fi
+
+COLLECT_GCC_OPTIONS=$(
+ "$CC" -march=native -v -E -x c -c /dev/null 2>&1 |\
+ sed -ne '/^COLLECT_GCC_OPTIONS=/{n;p}' |\
+ awk '{$1=$1};1'
+)
+echo "-march=native: $COLLECT_GCC_OPTIONS"
+_option "CONFIG_MARCH_NATIVE_CC_OPTIONS" "\"$COLLECT_GCC_OPTIONS\""
+
+"$CPUID" movbe && option "CONFIG_MARCH_NATIVE_MOVBE"
+"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
+"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
+
+for i in $COLLECT_GCC_OPTIONS; do
+ case $i in
+ */cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
+ ;;
+
+ l1-cache-line-size=64)
+ _option "CONFIG_X86_L1_CACHE_SHIFT" 6
+ _option "CONFIG_X86_INTERNODE_CACHE_SHIFT" 6
+ ;;
+
+ l1-cache-size=*);;
+ l2-cache-size=*);;
+
+ -march=*);;
+ -mtune=*);;
+
+ -m*);;
+ -mno-*);;
+
+ *)
+ echo >&2 "warning: unexpected -march=native option '$i'"
+ esac
+done
next reply other threads:[~2019-03-04 17:19 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-03-04 17:19 Alexey Dobriyan [this message]
2019-03-27 15:29 ` Linux 5.0-ad1: -march=native support Pavel Machek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190304171916.GA1694@avx2 \
--to=adobriyan@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.