* [PATCH] x86: clean up unnecessarily wide TEST insns
@ 2015-03-06 20:55 Denys Vlasenko
2015-03-06 21:05 ` Andy Lutomirski
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Denys Vlasenko @ 2015-03-06 20:55 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Denys Vlasenko, Linus Torvalds, Steven Rostedt, Ingo Molnar,
Borislav Petkov, H. Peter Anvin, Andy Lutomirski, Oleg Nesterov,
Frederic Weisbecker, Will Drewry, Kees Cook, x86, linux-kernel
By the nature of the TEST operation, it is often possible
to test a narrower part of the operand:
"testl $3, mem" -> "testb $3, mem",
"testq $3, %rcx" -> "testb $3, %cl"
This results in shorter insns, because the TEST insn has no
sign-extending byte-immediate forms, unlike other ALU ops.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ingo Molnar <mingo@kernel.org>
CC: Borislav Petkov <bp@alien8.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
---
Code changes in assembly are:
-48 f7 07 01 00 00 00 testq $0x1,(%rdi)
+f6 07 01 testb $0x1,(%rdi)
-48 f7 c1 01 00 00 00 test $0x1,%rcx
+f6 c1 01 test $0x1,%cl
-48 f7 c1 02 00 00 00 test $0x2,%rcx
+f6 c1 02 test $0x2,%cl
-41 f7 c2 01 00 00 00 test $0x1,%r10d
+41 f6 c2 01 test $0x1,%r10b
-48 f7 c1 04 00 00 00 test $0x4,%rcx
+f6 c1 04 test $0x4,%cl
-48 f7 c1 08 00 00 00 test $0x8,%rcx
+f6 c1 08 test $0x8,%cl
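
For reference, the size difference is easy to reproduce outside the kernel.
A minimal stand-alone sketch (assuming an x86-64 GNU toolchain; the file
name and symbol below are purely illustrative and not part of the patch):

# test-width.S -- assemble and dump with, e.g.:
#   as test-width.S -o test-width.o && objdump -d test-width.o
	.text
	.globl	demo
demo:
	testq	$0x1, (%rdi)	# 48 f7 07 01 00 00 00  REX.W opcode + imm32, 7 bytes
	testb	$0x1, (%rdi)	# f6 07 01              byte opcode + imm8, 3 bytes
	testq	$0x1, %rcx	# 48 f7 c1 01 00 00 00  7 bytes
	testb	$0x1, %cl	# f6 c1 01              3 bytes
	ret

For the jz/jnz uses touched here only ZF matters, and it is the same for
both widths since each tested mask fits in the lowest byte.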
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/relocate_kernel_32.S | 8 ++++----
arch/x86/kernel/relocate_kernel_64.S | 8 ++++----
arch/x86/lib/checksum_32.S | 4 ++--
arch/x86/lib/csum-copy_64.S | 2 +-
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a..dc177bf 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -146,7 +146,7 @@ startup_64:
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
+1: testb $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7..77630d5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c69..04cb179 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@ swap_pages:
movq (%rbx), %rcx
addq $8, %rbx
1:
- testq $0x1, %rcx /* is it a destination page? */
+ testb $0x1, %cl /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
- testq $0x2, %rcx /* is it an indirection page? */
+ testb $0x2, %cl /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
- testq $0x4, %rcx /* is it the done indicator? */
+ testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
- testq $0x8, %rcx /* is it the source indicator? */
+ testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index c3b9953..9bc944a 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -125,7 +125,7 @@ ENTRY(csum_partial)
6: addl %ecx,%eax
adcl $0, %eax
7:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
@@ -245,7 +245,7 @@ ENTRY(csum_partial)
addl %ebx,%eax
adcl $0,%eax
80:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5f..9734182 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
/* handle last odd byte */
.Lhandle_1:
- testl $1, %r10d
+ testb $1, %r10b
jz .Lende
xorl %ebx, %ebx
source
--
1.8.1.4
^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 20:55 [PATCH] x86: clean up unnecessarily wide TEST insns Denys Vlasenko
@ 2015-03-06 21:05 ` Andy Lutomirski
  2015-03-07 10:13   ` Ingo Molnar
  2015-03-06 22:24 ` Andi Kleen
  2015-03-07 16:43 ` [tip:x86/asm] x86/asm: Optimize unnecessarily wide TEST instructions tip-bot for Denys Vlasenko
  2 siblings, 1 reply; 8+ messages in thread
From: Andy Lutomirski @ 2015-03-06 21:05 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: H. Peter Anvin, Linus Torvalds, Steven Rostedt, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin, Oleg Nesterov,
	Frederic Weisbecker, Will Drewry, Kees Cook, X86 ML,
	linux-kernel@vger.kernel.org

On Fri, Mar 6, 2015 at 12:55 PM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
> By the nature of the TEST operation, it is often possible
> to test a narrower part of the operand:
> "testl $3, mem" -> "testb $3, mem",
> "testq $3, %rcx" -> "testb $3, %cl"
> This results in shorter insns, because the TEST insn has no
> sign-extending byte-immediate forms, unlike other ALU ops.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: Ingo Molnar <mingo@kernel.org>
> CC: Borislav Petkov <bp@alien8.de>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: Frederic Weisbecker <fweisbec@gmail.com>
> CC: Will Drewry <wad@chromium.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: x86@kernel.org
> CC: linux-kernel@vger.kernel.org
> ---
>
> Code changes in assembly are:
>
> -48 f7 07 01 00 00 00 testq $0x1,(%rdi)
> +f6 07 01 testb $0x1,(%rdi)
> -48 f7 c1 01 00 00 00 test $0x1,%rcx
> +f6 c1 01 test $0x1,%cl
> -48 f7 c1 02 00 00 00 test $0x2,%rcx
> +f6 c1 02 test $0x2,%cl
> -41 f7 c2 01 00 00 00 test $0x1,%r10d
> +41 f6 c2 01 test $0x1,%r10b
> -48 f7 c1 04 00 00 00 test $0x4,%rcx
> +f6 c1 04 test $0x4,%cl
> -48 f7 c1 08 00 00 00 test $0x8,%rcx
> +f6 c1 08 test $0x8,%cl
>
>  arch/x86/kernel/head_64.S            | 2 +-
>  arch/x86/kernel/relocate_kernel_32.S | 8 ++++----
>  arch/x86/kernel/relocate_kernel_64.S | 8 ++++----
>  arch/x86/lib/checksum_32.S           | 4 ++--
>  arch/x86/lib/csum-copy_64.S          | 2 +-

Looks good to me.  Ingo, should I queue this up?

--Andy

^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 21:05 ` Andy Lutomirski
@ 2015-03-07 10:13   ` Ingo Molnar
  0 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2015-03-07 10:13 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Steven Rostedt,
	Borislav Petkov, H. Peter Anvin, Oleg Nesterov,
	Frederic Weisbecker, Will Drewry, Kees Cook, X86 ML,
	linux-kernel@vger.kernel.org

* Andy Lutomirski <luto@amacapital.net> wrote:

> > +f6 c1 04 test $0x4,%cl
> > -48 f7 c1 08 00 00 00 test $0x8,%rcx
> > +f6 c1 08 test $0x8,%cl
> >
> >  arch/x86/kernel/head_64.S            | 2 +-
> >  arch/x86/kernel/relocate_kernel_32.S | 8 ++++----
> >  arch/x86/kernel/relocate_kernel_64.S | 8 ++++----
> >  arch/x86/lib/checksum_32.S           | 4 ++--
> >  arch/x86/lib/csum-copy_64.S          | 2 +-
>
> Looks good to me.  Ingo, should I queue this up?

No need, I picked it up now that the 32-bit crash of tip:x86/asm is
fixed. I juiced up the changelog a bit. Will push it out after a bit
of testing.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 20:55 [PATCH] x86: clean up unnecessarily wide TEST insns Denys Vlasenko
  2015-03-06 21:05 ` Andy Lutomirski
@ 2015-03-06 22:24 ` Andi Kleen
  2015-03-06 22:33   ` Andy Lutomirski
    ` (2 more replies)
  2015-03-07 16:43 ` [tip:x86/asm] x86/asm: Optimize unnecessarily wide TEST instructions tip-bot for Denys Vlasenko
  2 siblings, 3 replies; 8+ messages in thread
From: Andi Kleen @ 2015-03-06 22:24 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: H. Peter Anvin, Linus Torvalds, Steven Rostedt, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Frederic Weisbecker, Will Drewry, Kees Cook, x86, linux-kernel

Denys Vlasenko <dvlasenk@redhat.com> writes:

> By the nature of the TEST operation, it is often possible
> to test a narrower part of the operand:
> "testl $3, mem" -> "testb $3, mem",
> "testq $3, %rcx" -> "testb $3, %cl"
> This results in shorter insns, because the TEST insn has no
> sign-extending byte-immediate forms, unlike other ALU ops.

It also results in expensive LCP stalls. Please don't do it.
If you feel the need to change instructions around like this read
the optimization manuals first.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 22:24 ` Andi Kleen
@ 2015-03-06 22:33   ` Andy Lutomirski
  2015-03-06 22:37   ` Denys Vlasenko
  2015-03-06 22:54   ` Linus Torvalds
  2 siblings, 0 replies; 8+ messages in thread
From: Andy Lutomirski @ 2015-03-06 22:33 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Steven Rostedt,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin, Oleg Nesterov,
	Frederic Weisbecker, Will Drewry, Kees Cook, X86 ML,
	linux-kernel@vger.kernel.org

On Fri, Mar 6, 2015 at 2:24 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Denys Vlasenko <dvlasenk@redhat.com> writes:
>
>> By the nature of the TEST operation, it is often possible
>> to test a narrower part of the operand:
>> "testl $3, mem" -> "testb $3, mem",
>> "testq $3, %rcx" -> "testb $3, %cl"
>> This results in shorter insns, because the TEST insn has no
>> sign-extending byte-immediate forms, unlike other ALU ops.
>
> It also results in expensive LCP stalls. Please don't do it.
> If you feel the need to change instructions around like this read
> the optimization manuals first.

Is this true for register access or just memory?

--Andy

>
> -Andi
>
> --
> ak@linux.intel.com -- Speaking for myself only

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 22:24 ` Andi Kleen
  2015-03-06 22:33   ` Andy Lutomirski
@ 2015-03-06 22:37   ` Denys Vlasenko
  2015-03-06 22:54   ` Linus Torvalds
  2 siblings, 0 replies; 8+ messages in thread
From: Denys Vlasenko @ 2015-03-06 22:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Steven Rostedt,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin, Andy Lutomirski,
	Oleg Nesterov, Frederic Weisbecker, Will Drewry, Kees Cook,
	X86 ML, Linux Kernel Mailing List

On Fri, Mar 6, 2015 at 11:24 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Denys Vlasenko <dvlasenk@redhat.com> writes:
>
>> By the nature of the TEST operation, it is often possible
>> to test a narrower part of the operand:
>> "testl $3, mem" -> "testb $3, mem",
>> "testq $3, %rcx" -> "testb $3, %cl"
>> This results in shorter insns, because the TEST insn has no
>> sign-extending byte-immediate forms, unlike other ALU ops.
>
> It also results in expensive LCP stalls. Please don't do it.
> If you feel the need to change instructions around like this read
> the optimization manuals first.

Length-changing prefix (LCP) stalls result from the 0x66 prefix.
(See https://software.intel.com/en-us/forums/topic/328256).

Basically, LCP happens because adding a 66 byte before this instruction:

  [test_opcode] [modrm] [imm32]

changes it to

  [66] [test_opcode] [modrm] [imm16]

where [imm16] has a *different length* now: 2 bytes instead of 4.
This confuses the decoder.

REX prefixes were carefully designed to almost never hit this case:
adding a REX prefix does not change instruction length, except for the
MOVABS and MOV [addr],RAX insns.

My patch does not add optimizations which would use the 0x66 prefix.

^ permalink raw reply	[flat|nested] 8+ messages in thread
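
To make the prefix distinction concrete, a minimal sketch of the three
register forms (encodings as a typical GNU assembler emits them; only the
16-bit form picks up the length-changing 0x66 prefix, the byte form used
by the patch does not):

	testl	$0x1, %ecx	# f7 c1 01 00 00 00   imm32, no prefix
	testw	$0x1, %cx	# 66 f7 c1 01 00      0x66 prefix, imm shrinks to 2 bytes (the LCP case)
	testb	$0x1, %cl	# f6 c1 01            byte opcode + imm8, what the patch emits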
* Re: [PATCH] x86: clean up unnecessarily wide TEST insns
  2015-03-06 22:24 ` Andi Kleen
  2015-03-06 22:33   ` Andy Lutomirski
  2015-03-06 22:37   ` Denys Vlasenko
@ 2015-03-06 22:54   ` Linus Torvalds
  2 siblings, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2015-03-06 22:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Denys Vlasenko, H. Peter Anvin, Steven Rostedt, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Frederic Weisbecker, Will Drewry, Kees Cook,
	the arch/x86 maintainers, Linux Kernel Mailing List

On Fri, Mar 6, 2015 at 2:24 PM, Andi Kleen <andi@firstfloor.org> wrote:
>
> It also results in expensive LCP stalls.

BS, Andi. That's only true for prefixed operations - ie 16-bit data or
address overrides. There are no stalls from using 8-bit instruction
forms.

Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit ones
*could* cause problems if it ends up having forwarding issues, so that
instead of just forwarding the result, you end up having to wait for it
to be stable in the L1 cache (or possibly the register file).

The forwarding from the store buffer is simplest and most reliable if
the read is done at the exact same address and the exact same size as
the write that gets forwarded. But that's true only if

 (a) the write was very recent and is still in the write queue. I'm not
sure that's the case here anyway.

 (b) on at least most intel microarchitectures, you have to test a
different byte than the lowest one (so forwarding a 64-bit write to a
8-bit read ends up working fine, as long as the 8-bit read is of the
low 8 bits of the written data).

a very similar issue *might* show up for registers too, not just memory
writes, if you use 'testb' with a high-byte register (where instead of
forwarding the value from the original producer it needs to go through
the register file and then shifted). But it's mainly a problem for
store buffers.

But afaik, the way Denys changed the test instructions, neither of the
above issues should be true.

The real problem for store buffer forwarding tends to be "write 8 bits,
read 32 bits". That can be really surprisingly expensive, because the
read ends up having to wait until the write has hit the cacheline, and
we might talk tens of cycles of latency here. But "write 32 bits, read
the low 8 bits" *should* be fast on pretty much all x86 chips, afaik.

                     Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread
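
The two memory patterns Linus contrasts can be sketched as follows (an
illustration only; the register choice is arbitrary and the exact penalty
is microarchitecture-specific):

	# "write 32 bits, read the low 8 bits" -- the narrow load can typically
	# be forwarded from the store buffer, so it stays fast
	movl	$0x1, (%rdi)
	testb	$0x1, (%rdi)

	# "write 8 bits, read 32 bits" -- forwarding is blocked; the wide load
	# may have to wait until the byte store has reached the cache line
	movb	$0x1, (%rsi)
	testl	$0x1, (%rsi)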
* [tip:x86/asm] x86/asm: Optimize unnecessarily wide TEST instructions
  2015-03-06 20:55 [PATCH] x86: clean up unnecessarily wide TEST insns Denys Vlasenko
  2015-03-06 21:05 ` Andy Lutomirski
  2015-03-06 22:24 ` Andi Kleen
@ 2015-03-07 16:43 ` tip-bot for Denys Vlasenko
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Denys Vlasenko @ 2015-03-07 16:43 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, bp, oleg, mingo, wad, linux-kernel, hpa, luto, torvalds,
	rostedt, dvlasenk, hpa, keescook, fweisbec

Commit-ID:  3e1aa7cb59aff4b245b45e326fcdba1bf7f105c6
Gitweb:     http://git.kernel.org/tip/3e1aa7cb59aff4b245b45e326fcdba1bf7f105c6
Author:     Denys Vlasenko <dvlasenk@redhat.com>
AuthorDate: Fri, 6 Mar 2015 21:55:32 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 7 Mar 2015 11:12:43 +0100

x86/asm: Optimize unnecessarily wide TEST instructions

By the nature of the TEST operation, it is often possible
to test a narrower part of the operand:

    "testl $3, mem" -> "testb $3, mem",
    "testq $3, %rcx" -> "testb $3, %cl"

This results in shorter instructions, because the TEST instruction
has no sign-extending byte-immediate forms, unlike other ALU ops.

Note that this change does not create any LCP (Length-Changing Prefix)
stalls, which happen when adding a 0x66 prefix, which happens when
16-bit immediates are used, which changes such TEST instructions:

  [test_opcode] [modrm] [imm32]

to:

  [0x66] [test_opcode] [modrm] [imm16]

where [imm16] has a *different length* now: 2 bytes instead of 4.
This confuses the decoder and slows down execution.

REX prefixes were carefully designed to almost never hit this case:
adding REX prefix does not change instruction length except MOVABS
and MOV [addr],RAX instruction.

This patch does not add instructions which would use a 0x66 prefix,
code changes in assembly are:

    -48 f7 07 01 00 00 00 testq $0x1,(%rdi)
    +f6 07 01 testb $0x1,(%rdi)
    -48 f7 c1 01 00 00 00 test $0x1,%rcx
    +f6 c1 01 test $0x1,%cl
    -48 f7 c1 02 00 00 00 test $0x2,%rcx
    +f6 c1 02 test $0x2,%cl
    -41 f7 c2 01 00 00 00 test $0x1,%r10d
    +41 f6 c2 01 test $0x1,%r10b
    -48 f7 c1 04 00 00 00 test $0x4,%rcx
    +f6 c1 04 test $0x4,%cl
    -48 f7 c1 08 00 00 00 test $0x8,%rcx
    +f6 c1 08 test $0x8,%cl

Linus further notes:

   "There are no stalls from using 8-bit instruction forms.

    Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit
    ones *could* cause problems if it ends up having forwarding
    issues, so that instead of just forwarding the result, you end up
    having to wait for it to be stable in the L1 cache (or possibly
    the register file).

    The forwarding from the store buffer is simplest and most reliable
    if the read is done at the exact same address and the exact same
    size as the write that gets forwarded. But that's true only if:

     (a) the write was very recent and is still in the write queue.
         I'm not sure that's the case here anyway.

     (b) on at least most Intel microarchitectures, you have to test a
         different byte than the lowest one (so forwarding a 64-bit
         write to a 8-bit read ends up working fine, as long as the
         8-bit read is of the low 8 bits of the written data).

    A very similar issue *might* show up for registers too, not just
    memory writes, if you use 'testb' with a high-byte register (where
    instead of forwarding the value from the original producer it
    needs to go through the register file and then shifted). But it's
    mainly a problem for store buffers.

    But afaik, the way Denys changed the test instructions, neither of
    the above issues should be true.

    The real problem for store buffer forwarding tends to be "write 8
    bits, read 32 bits". That can be really surprisingly expensive,
    because the read ends up having to wait until the write has hit
    the cacheline, and we might talk tens of cycles of latency here.
    But "write 32 bits, read the low 8 bits" *should* be fast on
    pretty much all x86 chips, afaik."

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425675332-31576-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head_64.S            | 2 +-
 arch/x86/kernel/relocate_kernel_32.S | 8 ++++----
 arch/x86/kernel/relocate_kernel_64.S | 8 ++++----
 arch/x86/lib/checksum_32.S           | 4 ++--
 arch/x86/lib/csum-copy_64.S          | 2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9a09196..ae6588b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -146,7 +146,7 @@ startup_64:
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
+1: testb $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7..77630d5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c69..04cb179 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@ swap_pages:
movq (%rbx), %rcx
addq $8, %rbx
1:
- testq $0x1, %rcx /* is it a destination page? */
+ testb $0x1, %cl /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
- testq $0x2, %rcx /* is it an indirection page? */
+ testb $0x2, %cl /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
- testq $0x4, %rcx /* is it the done indicator? */
+ testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
- testq $0x8, %rcx /* is it the source indicator? */
+ testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index c3b9953..9bc944a 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -125,7 +125,7 @@ ENTRY(csum_partial)
6: addl %ecx,%eax
adcl $0, %eax
7:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
@@ -245,7 +245,7 @@ ENTRY(csum_partial)
addl %ebx,%eax
adcl $0,%eax
80:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5f..9734182 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
/* handle last odd byte */
.Lhandle_1:
- testl $1, %r10d
+ testb $1, %r10b
jz .Lende
xorl %ebx, %ebx
source

^ permalink raw reply related	[flat|nested] 8+ messages in thread
end of thread, other threads:[~2015-03-07 16:44 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-06 20:55 [PATCH] x86: clean up unnecessarily wide TEST insns Denys Vlasenko
2015-03-06 21:05 ` Andy Lutomirski
2015-03-07 10:13   ` Ingo Molnar
2015-03-06 22:24 ` Andi Kleen
2015-03-06 22:33   ` Andy Lutomirski
2015-03-06 22:37   ` Denys Vlasenko
2015-03-06 22:54   ` Linus Torvalds
2015-03-07 16:43 ` [tip:x86/asm] x86/asm: Optimize unnecessarily wide TEST instructions tip-bot for Denys Vlasenko