linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] Optimising IS_ERR_OR_NULL
@ 2014-04-05 14:43 Matthew Wilcox
  2014-04-05 15:48 ` Alexander Holler
  0 siblings, 1 reply; 2+ messages in thread
From: Matthew Wilcox @ 2014-04-05 14:43 UTC (permalink / raw)
  To: linux-kernel


(4 days too late for April Fools ... oh well :-)

I don't like the look of IS_ERR_OR_NULL.  It does two tests when (due to
the bit patterns used to represent errors and NULL pointers) it could
use just one:

#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)

static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
	return !ptr || IS_ERR_VALUE((unsigned long)ptr);
}

It needs some performance testing to be sure that it's a win, but I'm
essentially suggesting this:

+++ b/include/linux/err.h
@@ -34,10 +34,8 @@ static inline long __must_check IS_ERR(__force const void *pt
        return IS_ERR_VALUE((unsigned long)ptr);
 }
 
-static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
-{
-       return !ptr || IS_ERR_VALUE((unsigned long)ptr);
-}
+#define IS_ERR_OR_NULL(ptr) \
+       unlikely(((unsigned long)PTR_ERR(ptr) + MAX_ERRNO) <= MAX_ERRNO)
 
 /**
  * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type

(deliberately whitespace-damaged to ensure it doesn't get applied
without thought).

Comments, before I go off and run some perf tests?

Here's how I got to this point:

Let's take kern_unmount() as our example caller (for no particular reason
... first one I came across.  It is blessedly short though):

void kern_unmount(struct vfsmount *mnt)
{
        /* release long term mount so mount point can be released */
        if (!IS_ERR_OR_NULL(mnt)) {
                real_mount(mnt)->mnt_ns = NULL;
                synchronize_rcu();      /* yecchhh... */
                mntput(mnt);
        }
}

Here's the assembly for it:

0000000000001180 <kern_unmount>:
    1180:       48 85 ff                test   %rdi,%rdi
    1183:       53                      push   %rbx
    1184:       48 89 fb                mov    %rdi,%rbx
    1187:       74 27                   je     11b0 <kern_unmount+0x30>
    1189:       48 81 ff 00 f0 ff ff    cmp    $0xfffffffffffff000,%rdi
    1190:       77 1e                   ja     11b0 <kern_unmount+0x30>
    1192:       48 c7 87 c0 00 00 00    movq   $0x0,0xc0(%rdi)
    1199:       00 00 00 00 
    119d:       e8 00 00 00 00          callq  11a2 <kern_unmount+0x22>
                        119e: R_X86_64_PC32     synchronize_sched-0x4
    11a2:       48 89 df                mov    %rbx,%rdi
    11a5:       5b                      pop    %rbx
    11a6:       e9 00 00 00 00          jmpq   11ab <kern_unmount+0x2b>
                        11a7: R_X86_64_PC32     mntput-0x4
    11ab:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    11b0:       5b                      pop    %rbx
    11b1:       c3                      retq   
    11b2:       66 66 66 66 66 2e 0f    data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
    11b9:       1f 84 00 00 00 00 00 

So we do two tests and jumps; first at 1180 and then at 1189, in either
case we jump forward to the end.  Now here's what happens when we optimise
IS_ERR_OR_NULL to make use of the fact that 0 is actually contiguous
with the range of errors:

 static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
 {
-       return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+       return ((unsigned long)ptr + MAX_ERRNO) <= MAX_ERRNO;
 }

0000000000001180 <kern_unmount>:
    1180:       48 8d 87 ff 0f 00 00    lea    0xfff(%rdi),%rax
    1187:       48 3d ff 0f 00 00       cmp    $0xfff,%rax
    118d:       77 09                   ja     1198 <kern_unmount+0x18>
    118f:       f3 c3                   repz retq 
    1191:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
    1198:       48 83 ec 08             sub    $0x8,%rsp
    119c:       48 c7 87 c0 00 00 00    movq   $0x0,0xc0(%rdi)
    11a3:       00 00 00 00 
    11a7:       48 89 3c 24             mov    %rdi,(%rsp)
    11ab:       e8 00 00 00 00          callq  11b0 <kern_unmount+0x30>
                        11ac: R_X86_64_PC32     synchronize_sched-0x4
    11b0:       48 8b 3c 24             mov    (%rsp),%rdi
    11b4:       48 83 c4 08             add    $0x8,%rsp
    11b8:       e9 00 00 00 00          jmpq   11bd <kern_unmount+0x3d>
                        11b9: R_X86_64_PC32     mntput-0x4
    11bd:       0f 1f 00                nopl   (%rax)

It's fewer instructions, and only one compare, which is nice.  But gcc
now thinks that IS_ERR_OR_NULL is the likely case and has an early return
embedded in the function (118f).  So let's quickly fix that up:

-       if (!IS_ERR_OR_NULL(mnt)) {
+       if (!unlikely(IS_ERR_OR_NULL(mnt))) {

0000000000001180 <kern_unmount>:
    1180:       48 8d 87 ff 0f 00 00    lea    0xfff(%rdi),%rax
    1187:       53                      push   %rbx
    1188:       48 89 fb                mov    %rdi,%rbx
    118b:       48 3d ff 0f 00 00       cmp    $0xfff,%rax
    1191:       76 19                   jbe    11ac <kern_unmount+0x2c>
    1193:       48 c7 87 c0 00 00 00    movq   $0x0,0xc0(%rdi)
    119a:       00 00 00 00 
    119e:       e8 00 00 00 00          callq  11a3 <kern_unmount+0x23>
                        119f: R_X86_64_PC32     synchronize_sched-0x4
    11a3:       48 89 df                mov    %rbx,%rdi
    11a6:       5b                      pop    %rbx
    11a7:       e9 00 00 00 00          jmpq   11ac <kern_unmount+0x2c>
                        11a8: R_X86_64_PC32     mntput-0x4
    11ac:       5b                      pop    %rbx
    11ad:       c3                      retq   
    11ae:       66 90                   xchg   %ax,%ax

Now that's a genuine improvement; we saved one instruction (lea, cmp,
jbe vs test, je, cmp, ja), and due to various alignment / padding / etc.
we ended up saving 4 bytes on the length of the function which turns
into 16 bytes due to function alignment.


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [RFC] Optimising IS_ERR_OR_NULL
  2014-04-05 14:43 [RFC] Optimising IS_ERR_OR_NULL Matthew Wilcox
@ 2014-04-05 15:48 ` Alexander Holler
  0 siblings, 0 replies; 2+ messages in thread
From: Alexander Holler @ 2014-04-05 15:48 UTC (permalink / raw)
  To: Matthew Wilcox, linux-kernel

Am 05.04.2014 16:43, schrieb Matthew Wilcox:
>
> (4 days too late for April Fools ... oh well :-)
>
> I don't like the look of IS_ERR_OR_NULL.  It does two tests when (due to
> the bit patterns used to represent errors and NULL pointers) it could
> use just one:
>
> #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
>
> static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
> {
> 	return !ptr || IS_ERR_VALUE((unsigned long)ptr);
> }
>
> It needs some performance testing to be sure that it's a win, but I'm
> essentially suggesting this:
>
> +++ b/include/linux/err.h
> @@ -34,10 +34,8 @@ static inline long __must_check IS_ERR(__force const void *pt
>          return IS_ERR_VALUE((unsigned long)ptr);
>   }
>
> -static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
> -{
> -       return !ptr || IS_ERR_VALUE((unsigned long)ptr);
> -}
> +#define IS_ERR_OR_NULL(ptr) \
> +       unlikely(((unsigned long)PTR_ERR(ptr) + MAX_ERRNO) <= MAX_ERRNO)
>
>   /**
>    * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
>
> (deliberately whitespace-damaged to ensure it doesn't get applied
> without thought).
>
> Comments, before I go off and run some perf tests?
>

(... x86 asm)

> Now that's a genuine improvement; we saved one instruction (lea, cmp,
> jbe vs test, je, cmp, ja), and due to various alignment / padding / etc.
> we ended up saving 4 bytes on the length of the function which turns
> into 16 bytes due to function alignment.

A first comment, if that really will be changed, please leave to old 
function as comment for the lazy reader. The new one is pretty ugly and 
needs a turned on brain to understand (besides that the new one discards 
the __must_check, but I would assume that no one uses IS_ERR_OR_NULL() 
without checking the return value).

As I just was a bit bored, here's what happens on ARM so that others 
don't have to compile it on ARM:

armv5 old:

000011e0 <kern_unmount>:
     11e0:       e92d4010        push    {r4, lr}
     11e4:       e2504000        subs    r4, r0, #0
     11e8:       0a000007        beq     120c <kern_unmount+0x2c>
     11ec:       e3740a01        cmn     r4, #4096       ; 0x1000
     11f0:       8a000005        bhi     120c <kern_unmount+0x2c>
     11f4:       e3a03000        mov     r3, #0
     11f8:       e5843064        str     r3, [r4, #100]  ; 0x64
     11fc:       ebfffffe        bl      0 <synchronize_rcu>
     1200:       e1a00004        mov     r0, r4
     1204:       e8bd4010        pop     {r4, lr}
     1208:       eafffffe        b       c78 <mntput>
     120c:       e8bd8010        pop     {r4, pc}

armv5 new:

00000c98 <kern_unmount>:
      c98:       e2803eff        add     r3, r0, #4080   ; 0xff0
      c9c:       e283300f        add     r3, r3, #15
      ca0:       e3530a01        cmp     r3, #4096       ; 0x1000
      ca4:       e92d4010        push    {r4, lr}
      ca8:       e1a04000        mov     r4, r0
      cac:       3a000005        bcc     cc8 <kern_unmount+0x30>
      cb0:       e3a03000        mov     r3, #0
      cb4:       e5803064        str     r3, [r0, #100]  ; 0x64
      cb8:       ebfffffe        bl      0 <synchronize_rcu>
      cbc:       e1a00004        mov     r0, r4
      cc0:       e8bd4010        pop     {r4, lr}
      cc4:       eafffffe        b       c78 <mntput>
      cc8:       e8bd8010        pop     {r4, pc}

And armv7 old:

00000e60 <kern_unmount>:
      e60:       e92d4010        push    {r4, lr}
      e64:       e2504000        subs    r4, r0, #0
      e68:       08bd8010        popeq   {r4, pc}
      e6c:       e3740a01        cmn     r4, #4096       ; 0x1000
      e70:       88bd8010        pophi   {r4, pc}
      e74:       e3a03000        mov     r3, #0
      e78:       e5843064        str     r3, [r4, #100]  ; 0x64
      e7c:       ebfffffe        bl      0 <synchronize_sched>
      e80:       e1a00004        mov     r0, r4
      e84:       e8bd4010        pop     {r4, lr}
      e88:       eafffffe        b       a3c <mntput>

armv7 new:

00000a5c <kern_unmount>:
      a5c:       e2803eff        add     r3, r0, #4080   ; 0xff0
      a60:       e283300f        add     r3, r3, #15
      a64:       e3530a01        cmp     r3, #4096       ; 0x1000
      a68:       e92d4010        push    {r4, lr}
      a6c:       e1a04000        mov     r4, r0
      a70:       38bd8010        popcc   {r4, pc}
      a74:       e3a03000        mov     r3, #0
      a78:       e5803064        str     r3, [r0, #100]  ; 0x64
      a7c:       ebfffffe        bl      0 <synchronize_sched>
      a80:       e1a00004        mov     r0, r4
      a84:       e8bd4010        pop     {r4, lr}
      a88:       eafffffe        b       a3c <mntput>

Unfortunately I'm bad in interpreting assembler (I prefer higher 
languages like C++), therefor I don't even try it and leave further 
comments on the ARM assembler to other people. ;)

Regards,

Alexander Holler

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2014-04-05 15:49 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-04-05 14:43 [RFC] Optimising IS_ERR_OR_NULL Matthew Wilcox
2014-04-05 15:48 ` Alexander Holler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).