From: Alexey Dobriyan <adobriyan@gmail.com>
To: Borislav Petkov <bp@alien8.de>
Cc: tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
linux-kernel@vger.kernel.org, x86@vger.kernel.org,
linux@rasmusvillemoes.dk, torvalds@linux-foundation.org
Subject: Re: [PATCH] x86_64: new and improved memset()
Date: Sat, 14 Sep 2019 18:15:37 +0300 [thread overview]
Message-ID: <20190914151537.GA12068@avx2> (raw)
In-Reply-To: <20190914113717.GA28054@zn.tnic>
On Sat, Sep 14, 2019 at 01:37:17PM +0200, Borislav Petkov wrote:
> On Sat, Sep 14, 2019 at 01:33:45PM +0300, Alexey Dobriyan wrote:
> > --- a/arch/x86/include/asm/string_64.h
> > +++ b/arch/x86/include/asm/string_64.h
> > @@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
> > extern void *__memcpy(void *to, const void *from, size_t len);
> >
> > #define __HAVE_ARCH_MEMSET
> > +#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
> > void *memset(void *s, int c, size_t n);
> > +#else
> > +#include <asm/alternative.h>
> > +#include <asm/cpufeatures.h>
> > +
> > +/* Internal, do not use. */
> > +static __always_inline void memset0(void *s, size_t n)
> > +{
> > + /* Internal, do not use. */
> > + void _memset0_mov(void);
> > + void _memset0_rep_stosq(void);
> > + void memset0_mov(void);
> > + void memset0_rep_stosq(void);
> > + void memset0_rep_stosb(void);
> > +
> > + if (__builtin_constant_p(n) && n == 0) {
> > + } else if (__builtin_constant_p(n) && n == 1) {
> > + *(uint8_t *)s = 0;
> > + } else if (__builtin_constant_p(n) && n == 2) {
> > + *(uint16_t *)s = 0;
> > + } else if (__builtin_constant_p(n) && n == 4) {
> > + *(uint32_t *)s = 0;
> > + } else if (__builtin_constant_p(n) && n == 6) {
> > + *(uint32_t *)s = 0;
> > + *(uint16_t *)(s + 4) = 0;
> > + } else if (__builtin_constant_p(n) && n == 8) {
> > + *(uint64_t *)s = 0;
> > + } else if (__builtin_constant_p(n) && (n & 7) == 0) {
> > + alternative_call_2(
> > + _memset0_mov,
> > + _memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> > + memset0_rep_stosb, X86_FEATURE_ERMS,
> > + ASM_OUTPUT2("=D" (s), "=c" (n)),
> > + "D" (s), "c" (n)
> > + : "rax", "cc", "memory"
> > + );
> > + } else {
> > + alternative_call_2(
> > + memset0_mov,
> > + memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> > + memset0_rep_stosb, X86_FEATURE_ERMS,
> > + ASM_OUTPUT2("=D" (s), "=c" (n)),
> > + "D" (s), "c" (n)
> > + : "rax", "rsi", "cc", "memory"
> > + );
> > + }
> > +}
> > +
> > +/* Internal, do not use. */
> > +static __always_inline void memsetx(void *s, int c, size_t n)
> > +{
> > + /* Internal, do not use. */
> > + void _memsetx_mov(void);
> > + void _memsetx_rep_stosq(void);
> > + void memsetx_mov(void);
> > + void memsetx_rep_stosq(void);
> > + void memsetx_rep_stosb(void);
> > +
> > + const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL;
> > +
> > + if (__builtin_constant_p(n) && n == 0) {
> > + } else if (__builtin_constant_p(n) && n == 1) {
> > + *(uint8_t *)s = ccc;
> > + } else if (__builtin_constant_p(n) && n == 2) {
> > + *(uint16_t *)s = ccc;
> > + } else if (__builtin_constant_p(n) && n == 4) {
> > + *(uint32_t *)s = ccc;
> > + } else if (__builtin_constant_p(n) && n == 8) {
> > + *(uint64_t *)s = ccc;
> > + } else if (__builtin_constant_p(n) && (n & 7) == 0) {
> > + alternative_call_2(
> > + _memsetx_mov,
> > + _memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> > + memsetx_rep_stosb, X86_FEATURE_ERMS,
> > + ASM_OUTPUT2("=D" (s), "=c" (n)),
> > + "D" (s), "c" (n), "a" (ccc)
> > + : "cc", "memory"
> > + );
> > + } else {
> > + alternative_call_2(
> > + memsetx_mov,
> > + memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> > + memsetx_rep_stosb, X86_FEATURE_ERMS,
> > + ASM_OUTPUT2("=D" (s), "=c" (n)),
> > + "D" (s), "c" (n), "a" (ccc)
> > + : "rsi", "cc", "memory"
> > + );
> > + }
> > +}
> > +
> > +static __always_inline void *memset(void *s, int c, size_t n)
> > +{
> > + if (__builtin_constant_p(c)) {
> > + if (c == 0) {
> > + memset0(s, n);
> > + } else {
> > + memsetx(s, c, n);
> > + }
> > + return s;
> > + } else {
> > + return __builtin_memset(s, c, n);
> > + }
> > +}
>
> I'm willing to take something like that only when such complexity is
> justified by numbers. I.e., I'm much more inclined to capping it under
> 32 and 64 byte sizes and keeping it simple.
OK. Those small lengths were indeed annoying.
> > +ENTRY(_memset0_mov)
> > + xor eax, eax
> > +.globl _memsetx_mov
> > +_memsetx_mov:
> > + add rcx, rdi
> > + cmp rdi, rcx
> > + je 1f
> > +2:
> > + mov [rdi], rax
> > + add rdi, 8
> > + cmp rdi, rcx
> > + jne 2b
> > +1:
> > + ret
> > +ENDPROC(_memset0_mov)
> > +ENDPROC(_memsetx_mov)
> > +EXPORT_SYMBOL(_memset0_mov)
> > +EXPORT_SYMBOL(_memsetx_mov)
> > +
> > +ENTRY(memset0_mov)
> > + xor eax, eax
> > +.globl memsetx_mov
> > +memsetx_mov:
> > + lea rsi, [rdi + rcx]
> > + cmp rdi, rsi
> > + je 1f
> > +2:
> > + mov [rdi], al
> > + add rdi, 1
> > + cmp rdi, rsi
> > + jne 2b
> > +1:
> > + ret
>
> Say what now? Intel syntax? You must be joking...
It is the best thing in the x86 assembler universe.
> > +ENDPROC(memset0_mov)
> > +ENDPROC(memsetx_mov)
> > +EXPORT_SYMBOL(memset0_mov)
> > +EXPORT_SYMBOL(memsetx_mov)
>
> Too many exported symbols.
Those are technical exports. memset() remains the only developer-visible
interface.
> Again, I'll much more prefer a cleaner,
> smaller solution than one where readability suffers greatly at the
> expense of *maybe* getting a bit better performance.
Readability is a red herring; I, for one, find AT&T syntax unreadable.
next prev parent reply other threads:[~2019-09-14 15:15 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-09-14 10:33 [PATCH] x86_64: new and improved memset() Alexey Dobriyan
2019-09-14 11:37 ` Borislav Petkov
2019-09-14 15:15 ` Alexey Dobriyan [this message]
2019-09-16 7:54 ` kbuild test robot
2019-09-16 8:43 ` kbuild test robot
2019-09-16 14:18 ` David Laight
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190914151537.GA12068@avx2 \
--to=adobriyan@gmail.com \
--cc=bp@alien8.de \
--cc=hpa@zytor.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux@rasmusvillemoes.dk \
--cc=mingo@redhat.com \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
--cc=x86@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.