public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86: page: get_order() optimization
@ 2011-03-27  8:45 Maksym Planeta
  2011-03-27 11:33 ` Ingo Molnar
  2011-03-28 19:47 ` H. Peter Anvin
  0 siblings, 2 replies; 10+ messages in thread
From: Maksym Planeta @ 2011-03-27  8:45 UTC (permalink / raw)
  To: mingo; +Cc: kernel-janitors, namhyung, linux-kernel, Maksym Planeta

For x86 architecture get_order function can be optimized due to
assembler instruction bsr.

This is second version of patch where for constants gcc precompute the
result.

Signed-off-by: Maksym Planeta <mcsim.planeta@gmail.com>
---
 arch/x86/include/asm/getorder.h |   48 +++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/page.h     |    2 +-
 2 files changed, 49 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/include/asm/getorder.h

diff --git a/arch/x86/include/asm/getorder.h b/arch/x86/include/asm/getorder.h
new file mode 100644
index 0000000..b0c6f57
--- /dev/null
+++ b/arch/x86/include/asm/getorder.h
@@ -0,0 +1,48 @@
+#ifndef __ASM_GENERIC_GETORDER_H
+#define __ASM_GENERIC_GETORDER_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler.h>
+
+#ifdef CONFIG_X86_CMOV
+#define ASM_CMOVZ(op, dest) \
+		"cmovzl %" #op ",%" #dest ";\n\t"
+#else
+#define ASM_CMOVZ(op, dest) \
+		"jnz 1f;\n\t" \
+		"movl %" #op ", %" #dest ";\n\t" \
+		"1: "
+#endif
+
+static __always_inline int __get_order(unsigned long size)
+{
+	int order;
+
+	size = (size - 1) >> (PAGE_SHIFT - 1);
+	asm("bsr %1, %0\n\t"
+		ASM_CMOVZ(2, 0)
+		: "=&r" (order) : "rm" (size), "rm" (0));
+	return order;
+}
+
+/* Pure 2^n version of get_order */
+static inline __attribute_const__  int get_order(unsigned long size)
+{
+	int order;
+
+	if (__builtin_constant_p(size)) {
+		size = (size - 1) >> (PAGE_SHIFT - 1);
+		order = -1;
+		do {
+			size >>= 1;
+			order++;
+		} while (size);
+		return order;
+	}
+	return __get_order(size);
+}
+
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* __ASM_GENERIC_GETORDER_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca8283..10e4c45 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -63,7 +63,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
 #endif	/* __ASSEMBLY__ */
 
 #include <asm-generic/memory_model.h>
-#include <asm-generic/getorder.h>
+#include <asm/getorder.h>
 
 #define __HAVE_ARCH_GATE_AREA 1
 
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-27  8:45 [PATCH v2] x86: page: get_order() optimization Maksym Planeta
@ 2011-03-27 11:33 ` Ingo Molnar
  2011-03-27 16:22   ` Peter Hüwe
  2011-03-27 17:15   ` Maksym Planeta
  2011-03-28 19:47 ` H. Peter Anvin
  1 sibling, 2 replies; 10+ messages in thread
From: Ingo Molnar @ 2011-03-27 11:33 UTC (permalink / raw)
  To: Maksym Planeta; +Cc: mingo, kernel-janitors, namhyung, linux-kernel


* Maksym Planeta <mcsim.planeta@gmail.com> wrote:

> For x86 architecture get_order function can be optimized due to
> assembler instruction bsr.
> 
> This is second version of patch where for constants gcc precompute the
> result.
> 
> Signed-off-by: Maksym Planeta <mcsim.planeta@gmail.com>
> ---
>  arch/x86/include/asm/getorder.h |   48 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/page.h     |    2 +-
>  2 files changed, 49 insertions(+), 1 deletions(-)
>  create mode 100644 arch/x86/include/asm/getorder.h
> 
> diff --git a/arch/x86/include/asm/getorder.h b/arch/x86/include/asm/getorder.h
> new file mode 100644
> index 0000000..b0c6f57
> --- /dev/null
> +++ b/arch/x86/include/asm/getorder.h
> @@ -0,0 +1,48 @@
> +#ifndef __ASM_GENERIC_GETORDER_H
> +#define __ASM_GENERIC_GETORDER_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/compiler.h>
> +
> +#ifdef CONFIG_X86_CMOV
> +#define ASM_CMOVZ(op, dest) \
> +		"cmovzl %" #op ",%" #dest ";\n\t"
> +#else
> +#define ASM_CMOVZ(op, dest) \
> +		"jnz 1f;\n\t" \
> +		"movl %" #op ", %" #dest ";\n\t" \
> +		"1: "
> +#endif
> +
> +static __always_inline int __get_order(unsigned long size)
> +{
> +	int order;
> +
> +	size = (size - 1) >> (PAGE_SHIFT - 1);
> +	asm("bsr %1, %0\n\t"
> +		ASM_CMOVZ(2, 0)
> +		: "=&r" (order) : "rm" (size), "rm" (0));
> +	return order;
> +}
> +
> +/* Pure 2^n version of get_order */
> +static inline __attribute_const__  int get_order(unsigned long size)
> +{
> +	int order;
> +
> +	if (__builtin_constant_p(size)) {
> +		size = (size - 1) >> (PAGE_SHIFT - 1);
> +		order = -1;
> +		do {
> +			size >>= 1;
> +			order++;
> +		} while (size);
> +		return order;
> +	}
> +	return __get_order(size);
> +}
> +
> +#endif	/* __ASSEMBLY__ */
> +
> +#endif	/* __ASM_GENERIC_GETORDER_H */
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 8ca8283..10e4c45 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -63,7 +63,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
>  #endif	/* __ASSEMBLY__ */
>  
>  #include <asm-generic/memory_model.h>
> -#include <asm-generic/getorder.h>
> +#include <asm/getorder.h>

Just wondering, what's the before/after 'size vmlinux' effect on a 'make 
defconfig' x86 kernel? Does the optimization make the kernel smaller as well, 
besides making it faster?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-27 11:33 ` Ingo Molnar
@ 2011-03-27 16:22   ` Peter Hüwe
  2011-03-27 17:15   ` Maksym Planeta
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Hüwe @ 2011-03-27 16:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Maksym Planeta, mingo, kernel-janitors, namhyung, linux-kernel

Am Sonntag 27 März 2011, 13:33:23 schrieb Ingo Molnar:
> 
> Just wondering, what's the before/after 'size vmlinux' effect on a 'make
> defconfig' x86 kernel? Does the optimization make the kernel smaller as
> well, besides making it faster?
> 
> Thanks,
> 
> 	Ingo
Hi,

I compiled Linus' Tree (40471856) on my x86_64 machine, once without the patch 
and once with the patch.


size vmlinux-with*
   text    data     bss     dec     hex filename
7996901 1236036 1118208 10351145         9df229 vmlinux-withoutpatch
7996554 1235988 1118208 10350750         9df09e vmlinux-withpatch

ls -la vmlinux-with*
-rwxr-xr-x 1 peter users 18921237 27. Mär 17:40 vmlinux-withoutpatch
-rwxr-xr-x 1 peter users 18921333 27. Mär 17:58 vmlinux-withpatch



Peter 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-27 11:33 ` Ingo Molnar
  2011-03-27 16:22   ` Peter Hüwe
@ 2011-03-27 17:15   ` Maksym Planeta
  2011-03-28  5:08     ` Ingo Molnar
  1 sibling, 1 reply; 10+ messages in thread
From: Maksym Planeta @ 2011-03-27 17:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: mingo, kernel-janitors, namhyung, linux-kernel

On Sat, 27/03/2011 at 13:33 +0200, Ingo Molnar wrote:
> Just wondering, what's the before/after 'size vmlinux' effect on a 'make 
> defconfig' x86 kernel? Does the optimization make the kernel smaller as well, 
> besides making it faster?

Thank you for advice. I didn't really mentioned it. So without my patch:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
7915025	1253060	1122304	10290389	 9d04d5	vmlinux

And with it: 

size vmlinux
   text	   data	    bss	    dec	    hex	filename
7919150	1251364	1122304	10292818	 9d0e52	vmlinux

Size increased. But I discovered that if I replace "inline" with
"__always_inline" in get_order(), size will be following:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
7914481	1249252	1122304	10286037	 9cf3d5	vmlinux

And this is less than with same modification in asm-general:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
7914713	1249268	1122304	10286285	 9cf4cd	vmlinux

With my patch and "__always_inline" instead of just "inline" size will
be the smallest.

Signed-off-by: Maksym Planeta <mcsim.planeta@gmail.com>
---
 arch/x86/include/asm/getorder.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/getorder.h
b/arch/x86/include/asm/getorder.h
index b0c6f57..6220783 100644
--- a/arch/x86/include/asm/getorder.h
+++ b/arch/x86/include/asm/getorder.h
@@ -27,7 +27,7 @@ static __always_inline int __get_order(unsigned long
size)
 }
 
 /* Pure 2^n version of get_order */
-static inline __attribute_const__  int get_order(unsigned long size)
+static __always_inline __attribute_const__  int get_order(unsigned long
size)
 {
	int order;
-- 
Thanks,

Maksym Planeta


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-27 17:15   ` Maksym Planeta
@ 2011-03-28  5:08     ` Ingo Molnar
  2011-03-28 19:33       ` Maksym Planeta
  0 siblings, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2011-03-28  5:08 UTC (permalink / raw)
  To: Maksym Planeta
  Cc: mingo, kernel-janitors, namhyung, linux-kernel, Thomas Gleixner,
	H. Peter Anvin, Jan Beulich


* Maksym Planeta <mcsim.planeta@gmail.com> wrote:

> On Sat, 27/03/2011 at 13:33 +0200, Ingo Molnar wrote:
> > Just wondering, what's the before/after 'size vmlinux' effect on a 'make 
> > defconfig' x86 kernel? Does the optimization make the kernel smaller as well, 
> > besides making it faster?
> 
> Thank you for advice. I didn't really mentioned it. So without my patch:
> 
> size vmlinux
>    text	   data	    bss	    dec	    hex	filename
> 7915025	1253060	1122304	10290389	 9d04d5	vmlinux
> 
> And with it: 
> 
> size vmlinux
>    text	   data	    bss	    dec	    hex	filename
> 7919150	1251364	1122304	10292818	 9d0e52	vmlinux
> 
> Size increased. But I discovered that if I replace "inline" with
> "__always_inline" in get_order(), size will be following:
> 
> size vmlinux
>    text	   data	    bss	    dec	    hex	filename
> 7914481	1249252	1122304	10286037	 9cf3d5	vmlinux
> 
> And this is less than with same modification in asm-general:
> 
> size vmlinux
>    text	   data	    bss	    dec	    hex	filename
> 7914713	1249268	1122304	10286285	 9cf4cd	vmlinux
> 
> With my patch and "__always_inline" instead of just "inline" size will
> be the smallest.

Weird, that's an unexpected resut.

Have you looked at the disassembly, why does the size increase? I'd expect such 
a straight assembly optimization to result in smaller code: in the non-constant 
case it should be the same size as before, in the constant case it should be 
smaller, because BSR should be smaller than an open-coded search loop, right?

One sidenote, defconfig turns these on:

 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_OPTIMIZE_INLINING=y

And some versions of GCC arent very good with these.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-28  5:08     ` Ingo Molnar
@ 2011-03-28 19:33       ` Maksym Planeta
  2011-03-28 19:44         ` H. Peter Anvin
  0 siblings, 1 reply; 10+ messages in thread
From: Maksym Planeta @ 2011-03-28 19:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: mingo, kernel-janitors, namhyung, linux-kernel, Thomas Gleixner,
	H. Peter Anvin, Jan Beulich

On Mon, 28/03/2011 at 07:08 +0200, Ingo Molnar wrote:
> Have you looked at the disassembly, why does the size increase? I'd expect such 
> a straight assembly optimization to result in smaller code: in the non-constant 
> case it should be the same size as before, in the constant case it should be 
> smaller, because BSR should be smaller than an open-coded search loop, right?


Here is disassembly of patched get_order() with "inline" from
"kernel/kexec.c":

     a6c:       48 8b 5d c8             mov    -0x38(%rbp),%rbx
     a70:       e8 0b fd ff ff          callq  780 <get_order.clone.7>

0000000000000780 <get_order.clone.7>:
     780:       55                      push   %rbp
     781:       b8 01 00 00 00          mov    $0x1,%eax
     786:       48 89 e5                mov    %rsp,%rbp
     789:       c9                      leaveq 
     78a:       c3                      retq   

My version of gcc is gcc (Debian 4.5.2-4) 4.5.2, probably I should
upgrade my gcc version for better inline expansions.

-- 
Thanks,

Maksym Planeta


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-28 19:33       ` Maksym Planeta
@ 2011-03-28 19:44         ` H. Peter Anvin
  2011-04-01 14:08           ` Maksym Planeta
  0 siblings, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2011-03-28 19:44 UTC (permalink / raw)
  To: Maksym Planeta
  Cc: Ingo Molnar, mingo, kernel-janitors, namhyung, linux-kernel,
	Thomas Gleixner, Jan Beulich

On 03/28/2011 12:33 PM, Maksym Planeta wrote:
> 
> Here is disassembly of patched get_order() with "inline" from
> "kernel/kexec.c":
> 
>      a6c:       48 8b 5d c8             mov    -0x38(%rbp),%rbx
>      a70:       e8 0b fd ff ff          callq  780 <get_order.clone.7>
> 
> 0000000000000780 <get_order.clone.7>:
>      780:       55                      push   %rbp
>      781:       b8 01 00 00 00          mov    $0x1,%eax
>      786:       48 89 e5                mov    %rsp,%rbp
>      789:       c9                      leaveq 
>      78a:       c3                      retq   
> 
> My version of gcc is gcc (Debian 4.5.2-4) 4.5.2, probably I should
> upgrade my gcc version for better inline expansions.
> 

With what options?

	-hpa

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-27  8:45 [PATCH v2] x86: page: get_order() optimization Maksym Planeta
  2011-03-27 11:33 ` Ingo Molnar
@ 2011-03-28 19:47 ` H. Peter Anvin
  2011-03-29  7:27   ` Ingo Molnar
  1 sibling, 1 reply; 10+ messages in thread
From: H. Peter Anvin @ 2011-03-28 19:47 UTC (permalink / raw)
  To: Maksym Planeta; +Cc: mingo, kernel-janitors, namhyung, linux-kernel

On 03/27/2011 01:45 AM, Maksym Planeta wrote:
> For x86 architecture get_order function can be optimized due to
> assembler instruction bsr.
> 
> This is second version of patch where for constants gcc precompute the
> result.
> 
> Signed-off-by: Maksym Planeta <mcsim.planeta@gmail.com>

gcc 4.x has an intrinsic, __builtin_clz(), which does the opposite of
the bsr instruction; specifically:

	__builtin_clz(x) ^ 31

... generates a bsrl instruction if x is variable.  This tends to
generate much better code than any assembly hacks.

	-hpa

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-28 19:47 ` H. Peter Anvin
@ 2011-03-29  7:27   ` Ingo Molnar
  0 siblings, 0 replies; 10+ messages in thread
From: Ingo Molnar @ 2011-03-29  7:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Maksym Planeta, mingo, kernel-janitors, namhyung, linux-kernel


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 03/27/2011 01:45 AM, Maksym Planeta wrote:
> > For x86 architecture get_order function can be optimized due to
> > assembler instruction bsr.
> > 
> > This is second version of patch where for constants gcc precompute the
> > result.
> > 
> > Signed-off-by: Maksym Planeta <mcsim.planeta@gmail.com>
> 
> gcc 4.x has an intrinsic, __builtin_clz(), which does the opposite of
> the bsr instruction; specifically:
> 
> 	__builtin_clz(x) ^ 31
> 
> ... generates a bsrl instruction if x is variable.  This tends to
> generate much better code than any assembly hacks.

Indeed, that should work better and should be tried - and it can probably 
propagate the flags result sensibly (which GCC's asm() cannot, unfortunately).

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86: page: get_order() optimization
  2011-03-28 19:44         ` H. Peter Anvin
@ 2011-04-01 14:08           ` Maksym Planeta
  0 siblings, 0 replies; 10+ messages in thread
From: Maksym Planeta @ 2011-04-01 14:08 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, mingo, kernel-janitors, namhyung, linux-kernel,
	Thomas Gleixner, Jan Beulich

On Mon, 28/03/2011 at 12:44 -0700, H. Peter Anvin wrote:
> With what options?
> 
> 	-hpa

$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.5.2/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.5.2-4'
--with-bugurl=file:///usr/share/doc/gcc-4.5/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-4.5 --enable-shared --enable-multiarch
--enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib
--without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.5 --libdir=/usr/lib
--enable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --enable-plugin --enable-gold
--enable-ld=default --with-plugin-ld=ld.gold --enable-objc-gc
--with-arch-32=i586 --with-tune=generic --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.5.2 (Debian 4.5.2-4) 

for compiling kernel I've used: make defconfig && make -j3
-- 
Thanks,

Maksym Planeta


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-04-01 19:17 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-03-27  8:45 [PATCH v2] x86: page: get_order() optimization Maksym Planeta
2011-03-27 11:33 ` Ingo Molnar
2011-03-27 16:22   ` Peter Hüwe
2011-03-27 17:15   ` Maksym Planeta
2011-03-28  5:08     ` Ingo Molnar
2011-03-28 19:33       ` Maksym Planeta
2011-03-28 19:44         ` H. Peter Anvin
2011-04-01 14:08           ` Maksym Planeta
2011-03-28 19:47 ` H. Peter Anvin
2011-03-29  7:27   ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox