From: Alexey Dobriyan <adobriyan@gmail.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com
Cc: linux-kernel@vger.kernel.org, x86@vger.kernel.org,
linux@rasmusvillemoes.dk, torvalds@linux-foundation.org
Subject: [PATCH] x86_64: new and improved memset()
Date: Sat, 14 Sep 2019 13:33:45 +0300 [thread overview]
Message-ID: <20190914103345.GA5856@avx2> (raw)
Current memset() implementation does silly things:
* multiplication to get register-wide constant:
waste of cycles if filler is known at compile time,
* REP STOSQ followed by REP STOSB:
REP STOSB setup overhead is very high relative to the work done,
because the trailing length is very small (< 8)
* suboptimal calling convention:
REP STOSB/STOSQ favours (rdi, rcx), ABI gives (rdi, rsi, rdx).
While shuffling registers is free, rcx and rdx are equivalent
code generation wise.
* memset_orig():
memset(..., 0, ...) could be done within 3 registers,
memset(..., != 0, ...) -- within 4 registers, anything else is
a waste. CPUs which required unrolling are hopefully gone by now.
New implementation is based on the following observations:
* c == 0 is the most common form,
the filler can be produced with "xor eax, eax" inside the out-of-line
memset() helper, saving 2 bytes per call site and the multiplication
* "len" divisible by 8 is the most common form:
all it takes is one pointer or unsigned long inside structure,
dispatch at compile time to code without those ugly "let's fill
at most 7 bytes" tails,
* multiplication to get wider filler value can be done at compile time
for "c != 0", costing at most 1 insn/10 bytes and saving the run-time multiplication.
Note: "memset0" name is chosen because "bzero" is officially deprecated.
Note: memset(,0,) form is interleaved into memset(,c,) form to save space.
TODO:
CONFIG_FORTIFY_SOURCE is enabled by distros
inline "xor eax, eax; rep stosb"
benchmarks
testing
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
arch/x86/boot/compressed/Makefile | 1
arch/x86/include/asm/string_64.h | 104 ++++++++++++++++++++++++++++++++++
arch/x86/lib/Makefile | 1
arch/x86/lib/memset0_64.S | 86 ++++++++++++++++++++++++++++
drivers/firmware/efi/libstub/Makefile | 2
5 files changed, 193 insertions(+), 1 deletion(-)
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -38,6 +38,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
KBUILD_CFLAGS += -Wno-pointer-sign
+KBUILD_CFLAGS += -D_ARCH_X86_BOOT
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
+#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
void *memset(void *s, int c, size_t n);
+#else
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
+
+/* Internal, do not use. Zero-fill n bytes at s; memset() below calls this when c is the constant 0. */
+static __always_inline void memset0(void *s, size_t n)
+{
+ /* Internal, do not use. Asm helpers: s is passed in rdi and n in rcx (the "D"/"c" constraints below). */
+ void _memset0_mov(void);
+ void _memset0_rep_stosq(void);
+ void memset0_mov(void);
+ void memset0_rep_stosq(void);
+ void memset0_rep_stosb(void);
+
+ if (__builtin_constant_p(n) && n == 0) { /* constant zero length: nothing to store */
+ } else if (__builtin_constant_p(n) && n == 1) { /* constant n of 1, 2, 4, 6 or 8: plain stores, no call */
+ *(uint8_t *)s = 0;
+ } else if (__builtin_constant_p(n) && n == 2) {
+ *(uint16_t *)s = 0;
+ } else if (__builtin_constant_p(n) && n == 4) {
+ *(uint32_t *)s = 0;
+ } else if (__builtin_constant_p(n) && n == 6) {
+ *(uint32_t *)s = 0; /* bytes 0..3 */
+ *(uint16_t *)(s + 4) = 0; /* bytes 4..5; arithmetic on void * is a GNU C extension */
+ } else if (__builtin_constant_p(n) && (n & 7) == 0) { /* other constant multiples of 8: "_"-prefixed helpers */
+ alternative_call_2( /* NOTE(review): presumably picks one of the three helpers by CPU feature; confirm in asm/alternative.h */
+ _memset0_mov,
+ _memset0_rep_stosq, X86_FEATURE_REP_GOOD,
+ memset0_rep_stosb, X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=D" (s), "=c" (n)), /* helpers advance rdi and consume rcx */
+ "D" (s), "c" (n)
+ : "rax", "cc", "memory" /* rax: each memset0 helper zeroes eax itself */
+ );
+ } else { /* non-constant n, or a constant n not covered above */
+ alternative_call_2(
+ memset0_mov,
+ memset0_rep_stosq, X86_FEATURE_REP_GOOD,
+ memset0_rep_stosb, X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=D" (s), "=c" (n)),
+ "D" (s), "c" (n)
+ : "rax", "rsi", "cc", "memory" /* rsi: end pointer in memset0_mov and memset0_rep_stosq */
+ );
+ }
+}
+
+/* Internal, do not use. Fill n bytes at s with the low byte of c; memset() below calls this for a constant c other than 0. */
+static __always_inline void memsetx(void *s, int c, size_t n)
+{
+ /* Internal, do not use. Asm helpers: s in rdi, n in rcx, filler in rax (the "D"/"c"/"a" constraints below). */
+ void _memsetx_mov(void);
+ void _memsetx_rep_stosq(void);
+ void memsetx_mov(void);
+ void memsetx_rep_stosq(void);
+ void memsetx_rep_stosb(void);
+
+ const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL; /* low byte of c replicated into all 8 bytes */
+
+ if (__builtin_constant_p(n) && n == 0) { /* constant zero length: nothing to store */
+ } else if (__builtin_constant_p(n) && n == 1) { /* constant n of 1, 2, 4 or 8: one plain store of truncated ccc */
+ *(uint8_t *)s = ccc;
+ } else if (__builtin_constant_p(n) && n == 2) {
+ *(uint16_t *)s = ccc;
+ } else if (__builtin_constant_p(n) && n == 4) {
+ *(uint32_t *)s = ccc;
+ } else if (__builtin_constant_p(n) && n == 8) { /* NOTE(review): memset0() also special-cases n == 6; here n == 6 takes the generic call */
+ *(uint64_t *)s = ccc;
+ } else if (__builtin_constant_p(n) && (n & 7) == 0) { /* other constant multiples of 8: "_"-prefixed helpers */
+ alternative_call_2( /* NOTE(review): presumably picks one of the three helpers by CPU feature; confirm in asm/alternative.h */
+ _memsetx_mov,
+ _memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
+ memsetx_rep_stosb, X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=D" (s), "=c" (n)), /* helpers advance rdi and consume rcx */
+ "D" (s), "c" (n), "a" (ccc)
+ : "cc", "memory" /* rax is an input here, so it is not listed as a clobber */
+ );
+ } else { /* non-constant n, or a constant n not covered above */
+ alternative_call_2(
+ memsetx_mov,
+ memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
+ memsetx_rep_stosb, X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=D" (s), "=c" (n)),
+ "D" (s), "c" (n), "a" (ccc)
+ : "rsi", "cc", "memory" /* rsi: end pointer in memsetx_mov and memsetx_rep_stosq */
+ );
+ }
+}
+
+static __always_inline void *memset(void *s, int c, size_t n) /* inline memset(): dispatches on what is known about c at compile time */
+{
+ if (__builtin_constant_p(c)) { /* fill byte known at compile time */
+ if (c == 0) {
+ memset0(s, n); /* zero fill */
+ } else {
+ memsetx(s, c, n); /* constant non-zero c */
+ }
+ return s; /* the helpers return void, so hand back the destination here */
+ } else {
+ return __builtin_memset(s, c, n); /* run-time c: leave it to the compiler builtin */
+ }
+}
+#endif
void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMSET16
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -58,6 +58,7 @@ else
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
+ lib-y += memset0_64.o
lib-y += copy_user_64.o
lib-y += cmpxchg16b_emu.o
endif
new file mode 100644
--- /dev/null
+++ b/arch/x86/lib/memset0_64.S
@@ -0,0 +1,86 @@
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+.intel_syntax noprefix
+
+ENTRY(memset0_rep_stosb) /* in: rdi = destination, rcx = byte count */
+ xor eax, eax /* filler = 0, then fall through into the memsetx entry */
+.globl memsetx_rep_stosb
+memsetx_rep_stosb: /* in: rdi = destination, rcx = byte count, al = fill byte */
+ rep stosb /* store al at [rdi], rcx times */
+ ret
+ENDPROC(memset0_rep_stosb)
+ENDPROC(memsetx_rep_stosb)
+EXPORT_SYMBOL(memset0_rep_stosb)
+EXPORT_SYMBOL(memsetx_rep_stosb)
+
+ENTRY(_memset0_rep_stosq) /* in: rdi = destination, rcx = byte count; the C callers only pass multiples of 8 */
+ xor eax, eax /* filler = 0, then fall through into the memsetx entry */
+.globl _memsetx_rep_stosq
+_memsetx_rep_stosq: /* in: rdi = destination, rcx = byte count, rax = 8-byte filler */
+ shr rcx, 3 /* byte count -> qword count; the low 3 bits are dropped, there is no tail handling */
+ rep stosq /* store rax at [rdi], rcx times */
+ ret
+ENDPROC(_memset0_rep_stosq)
+ENDPROC(_memsetx_rep_stosq)
+EXPORT_SYMBOL(_memset0_rep_stosq)
+EXPORT_SYMBOL(_memsetx_rep_stosq)
+
+ENTRY(memset0_rep_stosq) /* in: rdi = destination, rcx = byte count (any value); uses rsi as scratch */
+ xor eax, eax /* filler = 0, then fall through into the memsetx entry */
+.globl memsetx_rep_stosq
+memsetx_rep_stosq: /* in: rdi = destination, rcx = byte count, rax = 8-byte filler */
+ lea rsi, [rdi + rcx] /* rsi = one past the last byte to fill */
+ shr rcx, 3 /* number of whole qwords */
+ rep stosq /* bulk fill, 8 bytes per store */
+ cmp rdi, rsi /* anything left over (count not a multiple of 8)? */
+ je 1f
+2: /* tail loop: one byte per iteration until rdi reaches rsi */
+ mov [rdi], al
+ add rdi, 1
+ cmp rdi, rsi
+ jne 2b
+1:
+ ret
+ENDPROC(memset0_rep_stosq)
+ENDPROC(memsetx_rep_stosq)
+EXPORT_SYMBOL(memset0_rep_stosq)
+EXPORT_SYMBOL(memsetx_rep_stosq)
+
+ENTRY(_memset0_mov) /* in: rdi = destination, rcx = byte count; the C callers only pass multiples of 8 */
+ xor eax, eax /* filler = 0, then fall through into the memsetx entry */
+.globl _memsetx_mov
+_memsetx_mov: /* in: rdi = destination, rcx = byte count, rax = 8-byte filler */
+ add rcx, rdi /* rcx = one past the last byte to fill */
+ cmp rdi, rcx /* zero length: skip the loop */
+ je 1f
+2: /* 8 bytes per iteration; the loop ends only when rdi equals rcx exactly */
+ mov [rdi], rax
+ add rdi, 8
+ cmp rdi, rcx
+ jne 2b
+1:
+ ret
+ENDPROC(_memset0_mov)
+ENDPROC(_memsetx_mov)
+EXPORT_SYMBOL(_memset0_mov)
+EXPORT_SYMBOL(_memsetx_mov)
+
+ENTRY(memset0_mov) /* in: rdi = destination, rcx = byte count (any value); uses rsi as scratch */
+ xor eax, eax /* filler = 0, then fall through into the memsetx entry */
+.globl memsetx_mov
+memsetx_mov: /* in: rdi = destination, rcx = byte count, al = fill byte */
+ lea rsi, [rdi + rcx] /* rsi = one past the last byte to fill */
+ cmp rdi, rsi /* zero length: skip the loop */
+ je 1f
+2: /* one byte per iteration until rdi reaches rsi */
+ mov [rdi], al
+ add rdi, 1
+ cmp rdi, rsi
+ jne 2b
+1:
+ ret
+ENDPROC(memset0_mov)
+ENDPROC(memsetx_mov)
+EXPORT_SYMBOL(memset0_mov)
+EXPORT_SYMBOL(memsetx_mov)
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
-D__NO_FORTIFY \
$(call cc-option,-ffreestanding) \
$(call cc-option,-fno-stack-protector) \
- -D__DISABLE_EXPORTS
+ -D__DISABLE_EXPORTS -D_ARCH_X86_BOOT
GCOV_PROFILE := n
KASAN_SANITIZE := n
next reply other threads:[~2019-09-14 10:33 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-09-14 10:33 Alexey Dobriyan [this message]
2019-09-14 11:37 ` [PATCH] x86_64: new and improved memset() Borislav Petkov
2019-09-14 15:15 ` Alexey Dobriyan
2019-09-16 7:54 ` kbuild test robot
2019-09-16 8:43 ` kbuild test robot
2019-09-16 14:18 ` David Laight
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190914103345.GA5856@avx2 \
--to=adobriyan@gmail.com \
--cc=bp@alien8.de \
--cc=hpa@zytor.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux@rasmusvillemoes.dk \
--cc=mingo@redhat.com \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
--cc=x86@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.