* [PATCH 1/2] x86: use alternatives for clear_user()
@ 2015-06-20 21:45 Alexey Dobriyan
2015-06-20 21:47 ` [PATCH 2/2] x86: fix incomplete clear by clear_user() Alexey Dobriyan
2015-06-21 6:52 ` [PATCH 1/2] x86: use alternatives for clear_user() Ingo Molnar
0 siblings, 2 replies; 5+ messages in thread
From: Alexey Dobriyan @ 2015-06-20 21:45 UTC (permalink / raw)
To: hpa, mingo; +Cc: x86, linux-kernel
Alternatives allow to pick faster code: REP STOSQ, REP STOSB or else.
Default to REP STOSQ (as memset() does).
Sticking printk() into clear_user() and booting Debian showed
that it is usually executed with 4-8 bytes of alignment and
~2000-4000 bytes of length. On this scale difference should be noticeable.
Also make __clear_user() more, shall we say, "modern". Remove storing zeroes
via zeroed register, replace INC/DEC with ADD/SUB for dependency breakage.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
arch/x86/lib/Makefile | 1
arch/x86/lib/clear_user_64.S | 93 +++++++++++++++++++++++++++++++++++++++++++
arch/x86/lib/usercopy_64.c | 35 ----------------
3 files changed, 95 insertions(+), 34 deletions(-)
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o misc.o cmdline.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
+lib-$(CONFIG_X86_64) += clear_user_64.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
--- /dev/null
+++ b/arch/x86/lib/clear_user_64.S
@@ -0,0 +1,93 @@
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+#include <asm/smap.h>
+
+# unsigned long __clear_user(void __user *, unsigned long);
+ENTRY(__clear_user)
+ CFI_STARTPROC
+
+ ALTERNATIVE_2 "jmp __clear_user_movq", \
+ "", X86_FEATURE_REP_GOOD, \
+ "jmp __clear_user_rep_stosb", X86_FEATURE_ERMS
+
+ ASM_STAC
+ xor %eax, %eax
+ mov %rsi, %rcx
+ and $7, %esi
+ shr $3, %rcx
+1: rep stosq
+ mov %esi, %ecx
+2: rep stosb
+3:
+ mov %rcx, %rax
+ ASM_CLAC
+ ret
+
+ .section .fixup,"ax"
+4: lea (%rsi,%rcx,8),%rcx
+ jmp 3b
+ .previous
+
+ _ASM_EXTABLE(1b,4b)
+ _ASM_EXTABLE(2b,3b)
+
+ CFI_ENDPROC
+ENDPROC(__clear_user)
+
+ENTRY(__clear_user_movq)
+ CFI_STARTPROC
+
+ ASM_STAC
+ mov %rsi, %rcx
+ and $7, %esi
+ shr $3, %rcx
+ jz 2f
+ .p2align 4
+1:
+ movq $0, (%rdi)
+ add $8, %rdi
+ sub $1, %rcx
+ jnz 1b
+2:
+ mov %esi, %ecx
+ test %ecx, %ecx
+ jz 4f
+ .p2align 4
+3:
+ movb $0, (%rdi)
+ add $1, %rdi
+ sub $1, %ecx
+ jnz 3b
+4:
+ mov %rcx, %rax
+ ASM_CLAC
+ ret
+
+ .section .fixup,"ax"
+5: lea (%rsi,%rcx,8),%rcx
+ jmp 4b
+ .previous
+
+ _ASM_EXTABLE(1b,5b)
+ _ASM_EXTABLE(3b,4b)
+
+ CFI_ENDPROC
+ENDPROC(__clear_user_movq)
+
+ENTRY(__clear_user_rep_stosb)
+ CFI_STARTPROC
+
+ ASM_STAC
+ xor %eax, %eax
+ mov %rsi, %rcx
+1: rep stosb
+2:
+ mov %rcx, %rax
+ ASM_CLAC
+ ret
+
+ _ASM_EXTABLE(1b,2b)
+
+ CFI_ENDPROC
+ENDPROC(__clear_user_rep_stosb)
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -12,40 +12,6 @@
* Zero Userspace
*/
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
- long __d0;
- might_fault();
- /* no memory constraint because it doesn't change any memory gcc knows
- about */
- stac();
- asm volatile(
- " testq %[size8],%[size8]\n"
- " jz 4f\n"
- "0: movq %[zero],(%[dst])\n"
- " addq %[eight],%[dst]\n"
- " decl %%ecx ; jnz 0b\n"
- "4: movq %[size1],%%rcx\n"
- " testl %%ecx,%%ecx\n"
- " jz 2f\n"
- "1: movb %b[zero],(%[dst])\n"
- " incq %[dst]\n"
- " decl %%ecx ; jnz 1b\n"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: lea 0(%[size1],%[size8],8),%[size8]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(0b,3b)
- _ASM_EXTABLE(1b,2b)
- : [size8] "=&c"(size), [dst] "=&D" (__d0)
- : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
- [zero] "r" (0UL), [eight] "r" (8UL));
- clac();
- return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
unsigned long clear_user(void __user *to, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
@@ -53,6 +19,7 @@ unsigned long clear_user(void __user *to, unsigned long n)
return n;
}
EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
{
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 2/2] x86: fix incomplete clear by clear_user()
2015-06-20 21:45 [PATCH 1/2] x86: use alternatives for clear_user() Alexey Dobriyan
@ 2015-06-20 21:47 ` Alexey Dobriyan
2015-06-21 6:55 ` Ingo Molnar
2015-06-21 6:52 ` [PATCH 1/2] x86: use alternatives for clear_user() Ingo Molnar
1 sibling, 1 reply; 5+ messages in thread
From: Alexey Dobriyan @ 2015-06-20 21:47 UTC (permalink / raw)
To: hpa, mingo; +Cc: x86, linux-kernel
clear_user() used MOVQ+MOVB and if MOVQ faults, code simply exits and
honestly returns remaining length. In case of unaligned area, unaligned
remainder would count towards return value (correctly) but not cleared
(lazy code at least):
clear_user(p + 4096 - 4, 8) = 8
No one would have noticed but REP STOSB addition to clear_user()
repertoire creates a problem: REP STOSB does everything correctly,
clears and counts to the last possible byte, but REP STOSQ and MOVQ
variants DO NOT:
MOVQ clear_user(p + 4096 - 4, 8) = 8
REP STOSQ clear_user(p + 4096 - 4, 8) = 8
REP STOSB clear_user(p + 4096 - 4, 8) = 4
Patch fixes incomplete clear on 32-bit and 64-bit REP STOSQ, MOVQ.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
arch/x86/lib/clear_user_64.S | 9 ++++++---
arch/x86/lib/usercopy_32.c | 3 ++-
2 files changed, 8 insertions(+), 4 deletions(-)
--- a/arch/x86/lib/clear_user_64.S
+++ b/arch/x86/lib/clear_user_64.S
@@ -26,7 +26,8 @@ ENTRY(__clear_user)
.section .fixup,"ax"
4: lea (%rsi,%rcx,8),%rcx
- jmp 3b
+ # Fill as much as possible with byte stores.
+ jmp 2b
.previous
_ASM_EXTABLE(1b,4b)
@@ -57,7 +58,8 @@ ENTRY(__clear_user_movq)
3:
movb $0, (%rdi)
add $1, %rdi
- sub $1, %ecx
+ # Unaligned area and 4GB+ tail after recovery require RCX here.
+ sub $1, %rcx
jnz 3b
4:
mov %rcx, %rax
@@ -66,7 +68,8 @@ ENTRY(__clear_user_movq)
.section .fixup,"ax"
5: lea (%rsi,%rcx,8),%rcx
- jmp 4b
+ # Fill as much as possible with byte stores.
+ jmp 3b
.previous
_ASM_EXTABLE(1b,5b)
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -49,7 +49,8 @@ do { \
"2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
"3: lea 0(%2,%0,4),%0\n" \
- " jmp 2b\n" \
+ /* Fill as much as possible with byte stores. */ \
+ " jmp 1b\n" \
".previous\n" \
_ASM_EXTABLE(0b,3b) \
_ASM_EXTABLE(1b,2b) \
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] x86: use alternatives for clear_user()
2015-06-20 21:45 [PATCH 1/2] x86: use alternatives for clear_user() Alexey Dobriyan
2015-06-20 21:47 ` [PATCH 2/2] x86: fix incomplete clear by clear_user() Alexey Dobriyan
@ 2015-06-21 6:52 ` Ingo Molnar
2015-06-22 12:18 ` Alexey Dobriyan
1 sibling, 1 reply; 5+ messages in thread
From: Ingo Molnar @ 2015-06-21 6:52 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: hpa, x86, linux-kernel, Andy Lutomirski, Borislav Petkov,
Brian Gerst, Denys Vlasenko, Thomas Gleixner, Peter Zijlstra
* Alexey Dobriyan <adobriyan@gmail.com> wrote:
> Alternatives allow to pick faster code: REP STOSQ, REP STOSB or else. Default to
> REP STOSQ (as memset() does).
>
> Sticking printk() into clear_user() and booting Debian showed that it is usually
> executed with 4-8 bytes of alignment and ~2000-4000 bytes of length. On this
> scale difference should be noticeable.
>
> Also make __clear_user() more, shall we say, "modern". Remove storing zeroes via
> zeroed register, replace INC/DEC with ADD/SUB for dependency breakage.
Yeah, so I like the direction of these changes (modulo the suggestions I make for
the patch, see further below), but note that our (new) policy with x86 assembly
patches is pretty brutal:
1)
This patch should be split into at least a series of 3 patches:
- patch #1 moves the old code into the new file
- patch #2 does the modernization of assembly ops to the old MOVQ method
- patch #3 adds the alternatives patching to introduce the STOSB and STOSQ
methods
2)
Please cite before/after /usr/bin/size comparisons of the .o (and vmlinux where
sensible) in the changelog (you can also do objdump -d comparisons), especially
for the first and second patch this will show that the move is an invariant, and
that the effects of the modernization of the MOVQ method.
3)
We require benchmark numbers for changes to such commonly used APIs: please add
clear_user() support to 'perf bench', as an extension to the existing memcpy and
memset benchmarks:
triton:~/tip> ls -l tools/perf/bench/mem-*x86*.S
-rw-rw-r-- 1 mingo mingo 413 Jun 18 22:53 tools/perf/bench/mem-memcpy-x86-64-asm.S
-rw-rw-r-- 1 mingo mingo 414 Jun 18 22:53 tools/perf/bench/mem-memset-x86-64-asm.S
You can run them with something like:
galatea:~> taskset 1 perf bench mem memset -o -i 10000 -l 1600 -r all
# Running 'mem/memset' benchmark:
Routine default (Default memset() provided by glibc)
# Copying 1600 Bytes ...
44.348694 GB/Sec (with prefault)
Routine x86-64-unrolled (unrolled memset() in arch/x86/lib/memset_64.S)
# Copying 1600 Bytes ...
24.548865 GB/Sec (with prefault)
Routine x86-64-stosq (movsq-based memset() in arch/x86/lib/memset_64.S)
# Copying 1600 Bytes ...
27.645939 GB/Sec (with prefault)
Routine x86-64-stosb (movsb-based memset() in arch/x86/lib/memset_64.S)
# Copying 1600 Bytes ...
47.760132 GB/Sec (with prefault)
You might want to test the specific length values that your printk profiling has
shown to be characteristic, via the -l <length> parameter, because some of these
routines are sensitive to alignment and length details.
Note that this kind of benchmarking matters: for example on this box where I ran
the above memset test it clearly shows that the STOSB method is superior.
I can help out with extending 'perf bench' with clear_user measurements if you'd
like.
> +# unsigned long __clear_user(void __user *, unsigned long);
> +ENTRY(__clear_user)
> + CFI_STARTPROC
There's no CFI_* in the x86 tree anymore, please send a patch against tip:master.
> +
> + ALTERNATIVE_2 "jmp __clear_user_movq", \
> + "", X86_FEATURE_REP_GOOD, \
> + "jmp __clear_user_rep_stosb", X86_FEATURE_ERMS
Can we move this into clear_user(), and patch in CALL instructions instead of
jumps? There's no reason to do these extra jumps.
> +
So for consistency's sake I'd put a label here that names the default function
__clear_user_rep_stosq. So that we know what it is when it shows up in 'perf top'.
(With the 'CALL' patching approach this would become __clear_user_rep_stosq in a
natural fashion - so in that case the extra label is not needed.)
> + ASM_STAC
> + xor %eax, %eax
> + mov %rsi, %rcx
> + and $7, %esi
> + shr $3, %rcx
> +1: rep stosq
> + mov %esi, %ecx
> +2: rep stosb
> +3:
> + mov %rcx, %rax
> + ASM_CLAC
> + ret
So I'd switch the ASM_CLAC with the MOV, because flags manipulation probably has
higher latency than a simple register move. (the same reason we do the STAC as the
first step)
> +ENTRY(__clear_user_movq)
> + CFI_STARTPROC
> +
> + ASM_STAC
With JMP patching: so every __clear_user_* method starts with a STAC, so I'd move
the STAC into the __clear_user function prologue, before the __clear_user_*
alternatives patching site - this might reduce flag value dependencies as well by
the time it's used.
With CALL patching: this can needs to be in the specific functions, like you have
it now.
> + mov %rsi, %rcx
> + and $7, %esi
> + shr $3, %rcx
> + jz 2f
> + .p2align 4
There's no need to align this small loop - the NOP only slows things down.
> +1:
> + movq $0, (%rdi)
> + add $8, %rdi
> + sub $1, %rcx
> + jnz 1b
> +2:
> + mov %esi, %ecx
> + test %ecx, %ecx
> + jz 4f
> + .p2align 4
Ditto.
> +3:
> + movb $0, (%rdi)
> + add $1, %rdi
> + sub $1, %ecx
> + jnz 3b
> +4:
> + mov %rcx, %rax
> + ASM_CLAC
I'd flip this one too.
> + ret
> +
> + .section .fixup,"ax"
> +5: lea (%rsi,%rcx,8),%rcx
> + jmp 4b
> + .previous
> +
> + _ASM_EXTABLE(1b,5b)
> + _ASM_EXTABLE(3b,4b)
> +
> + CFI_ENDPROC
> +ENDPROC(__clear_user_movq)
> +
> +ENTRY(__clear_user_rep_stosb)
> + CFI_STARTPROC
> +
> + ASM_STAC
> + xor %eax, %eax
> + mov %rsi, %rcx
> +1: rep stosb
> +2:
> + mov %rcx, %rax
> + ASM_CLAC
And I'd flip this one too.
> + ret
> +
> + _ASM_EXTABLE(1b,2b)
> +
> + CFI_ENDPROC
> +ENDPROC(__clear_user_rep_stosb)
> @@ -53,6 +19,7 @@ unsigned long clear_user(void __user *to, unsigned long n)
> return n;
> }
> EXPORT_SYMBOL(clear_user);
> +EXPORT_SYMBOL(__clear_user);
Also, please consider inlining clear_user()'s access_ok() check: so that the only
function left is __clear_user(). (if then that should be a separate patch)
Thanks,
Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] x86: fix incomplete clear by clear_user()
2015-06-20 21:47 ` [PATCH 2/2] x86: fix incomplete clear by clear_user() Alexey Dobriyan
@ 2015-06-21 6:55 ` Ingo Molnar
0 siblings, 0 replies; 5+ messages in thread
From: Ingo Molnar @ 2015-06-21 6:55 UTC (permalink / raw)
To: Alexey Dobriyan
Cc: hpa, x86, linux-kernel, Thomas Gleixner, Borislav Petkov,
Peter Zijlstra, Denys Vlasenko, Andy Lutomirski, Brian Gerst
* Alexey Dobriyan <adobriyan@gmail.com> wrote:
> clear_user() used MOVQ+MOVB and if MOVQ faults, code simply exits and
> honestly returns remaining length. In case of unaligned area, unaligned
> remainder would count towards return value (correctly) but not cleared
> (lazy code at least):
>
> clear_user(p + 4096 - 4, 8) = 8
>
> No one would have noticed but REP STOSB addition to clear_user()
> repertoire creates a problem: REP STOSB does everything correctly,
> clears and counts to the last possible byte, but REP STOSQ and MOVQ
> variants DO NOT:
>
> MOVQ clear_user(p + 4096 - 4, 8) = 8
> REP STOSQ clear_user(p + 4096 - 4, 8) = 8
> REP STOSB clear_user(p + 4096 - 4, 8) = 4
>
> Patch fixes incomplete clear on 32-bit and 64-bit REP STOSQ, MOVQ.
So please flip the order of the changes around so that we never have this
inconsistency observable: i.e. first update the existing clearing method, then
move it and introduce the new variants without having to patch them.
Thanks,
Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] x86: use alternatives for clear_user()
2015-06-21 6:52 ` [PATCH 1/2] x86: use alternatives for clear_user() Ingo Molnar
@ 2015-06-22 12:18 ` Alexey Dobriyan
0 siblings, 0 replies; 5+ messages in thread
From: Alexey Dobriyan @ 2015-06-22 12:18 UTC (permalink / raw)
To: Ingo Molnar
Cc: H. Peter Anvin, x86, Linux Kernel, Andy Lutomirski,
Borislav Petkov, Brian Gerst, Denys Vlasenko, Thomas Gleixner,
Peter Zijlstra
On Sun, Jun 21, 2015 at 9:52 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
>> Alternatives allow to pick faster code: REP STOSQ, REP STOSB or else. Default to
>> REP STOSQ (as memset() does).
> 2)
>
> Please cite before/after /usr/bin/size comparisons of the .o (and vmlinux where
> sensible) in the changelog (you can also do objdump -d comparisons), especially
> for the first and second patch this will show that the move is an invariant, and
> that the effects of the modernization of the MOVQ method.
Difference is obviously negligible: ADD/SUB adds 1 byte compared to INC/DEC,
MOVQ 0 adds several.
>> +
>> + ALTERNATIVE_2 "jmp __clear_user_movq", \
>> + "", X86_FEATURE_REP_GOOD, \
>> + "jmp __clear_user_rep_stosb", X86_FEATURE_ERMS
>
> Can we move this into clear_user(), and patch in CALL instructions instead of
> jumps? There's no reason to do these extra jumps.
Yes, should be faster.
> So for consistency's sake I'd put a label here that names the default function
> __clear_user_rep_stosq. So that we know what it is when it shows up in 'perf top'.
>
> (With the 'CALL' patching approach this would become __clear_user_rep_stosq in a
> natural fashion - so in that case the extra label is not needed.)
>
>> + ASM_STAC
>> + xor %eax, %eax
>> + mov %rsi, %rcx
>> + and $7, %esi
>> + shr $3, %rcx
>> +1: rep stosq
>> + mov %esi, %ecx
>> +2: rep stosb
>> +3:
>> + mov %rcx, %rax
>> + ASM_CLAC
>> + ret
>
> So I'd switch the ASM_CLAC with the MOV, because flags manipulation probably has
> higher latency than a simple register move. (the same reason we do the STAC as the
> first step)
Maybe, it is written this way for symmetry, if STAC is first
instruction, CLAC is the last,
no mismatches possible.
>> + mov %rsi, %rcx
>> + and $7, %esi
>> + shr $3, %rcx
>> + jz 2f
>> + .p2align 4
>
> There's no need to align this small loop - the NOP only slows things down.
Alignment was copied from memset()/copy_from_user() loops, probably
doesn't matter.
>> EXPORT_SYMBOL(clear_user);
>> +EXPORT_SYMBOL(__clear_user);
>
> Also, please consider inlining clear_user()'s access_ok() check: so that the only
> function left is __clear_user(). (if then that should be a separate patch)
Inlining access_ok() will add ~30 bytes to every call site.
And there is might_fault() eyesore as well.
0000000000000050 <clear_user>:
50: 48 8d 14 37 lea (%rdi,%rsi,1),%rdx
54: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax
5b: 00 00
59: R_X86_64_32S cpu_tss+0x4
5d: 48 39 d6 cmp %rdx,%rsi
60: 48 8b 88 18 c0 ff ff mov -0x3fe8(%rax),%rcx
67: 48 89 f0 mov %rsi,%rax
6a: 77 0f ja 7b <clear_user+0x2b>
6c: 48 39 d1 cmp %rdx,%rcx
6f: 72 0a jb 7b <clear_user+0x2b>
71: 55 push %rbp
72: 48 89 e5 mov %rsp,%rbp
75: e8 00 00 00 00 callq 7a <clear_user+0x2a>
76: R_X86_64_PC32 __clear_user-0x4
7a: 5d pop %rbp
7b: c3 retq
I'll fix according to your suggestions and resend.
Alexey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2015-06-22 12:18 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-20 21:45 [PATCH 1/2] x86: use alternatives for clear_user() Alexey Dobriyan
2015-06-20 21:47 ` [PATCH 2/2] x86: fix incomplete clear by clear_user() Alexey Dobriyan
2015-06-21 6:55 ` Ingo Molnar
2015-06-21 6:52 ` [PATCH 1/2] x86: use alternatives for clear_user() Ingo Molnar
2015-06-22 12:18 ` Alexey Dobriyan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox