Re: Why Git is so fast - Steven Noonan

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Steven Noonan <steven@uplinklabs.net>
To: Kjetil Barvik <barvik@broadpark.no>
Cc: "Shawn O. Pearce" <spearce@spearce.org>, git@vger.kernel.org
Subject: Re: Why Git is so fast
Date: Thu, 30 Apr 2009 17:23:57 -0700	[thread overview]
Message-ID: <f488382f0904301723i37ef03d9w4e93848e603ed28b@mail.gmail.com> (raw)
In-Reply-To: <8663gllt88.fsf@broadpark.no>

On Thu, Apr 30, 2009 at 2:36 PM, Kjetil Barvik <barvik@broadpark.no> wrote:
> * "Shawn O. Pearce" <spearce@spearce.org> writes:
> |>      4) The "static inline void hashcpy(....)" in cache.h could then
> |>         maybe be written like this:
> |
> | Its already done as "memcpy(a, b, 20)" which most compilers will
> | inline and probably reduce to 5 word moves anyway.  That's why
> | hashcpy() itself is inline.
>
>  But would the compiler be able to trust that the hashcpy() is always
>  called with correct word alignment on variables a and b?
>
>  I made a test and compiled git with:
>
>    make USE_NSEC=1 CFLAGS="-march=core2 -mtune=core2 -O2 -g2 -fno-stack-protector" clean all
>
>  compiler: gcc (Gentoo 4.3.3-r2 p1.1, pie-10.1.5) 4.3.3
>  CPU: Intel(R) Core(TM)2 CPU T7200 @ 2.00GHz GenuineIntel
>
>  Then used gdb to get the following:
>
> (gdb) disassemble write_sha1_file
> Dump of assembler code for function write_sha1_file:
> 0x080e3830 <write_sha1_file+0>: push   %ebp
> 0x080e3831 <write_sha1_file+1>: mov    %esp,%ebp
> 0x080e3833 <write_sha1_file+3>: sub    $0x58,%esp
> 0x080e3836 <write_sha1_file+6>: lea    -0x10(%ebp),%eax
> 0x080e3839 <write_sha1_file+9>: mov    %ebx,-0xc(%ebp)
> 0x080e383c <write_sha1_file+12>:        mov    %esi,-0x8(%ebp)
> 0x080e383f <write_sha1_file+15>:        mov    %edi,-0x4(%ebp)
> 0x080e3842 <write_sha1_file+18>:        mov    0x14(%ebp),%ebx
> 0x080e3845 <write_sha1_file+21>:        mov    %eax,0x8(%esp)
> 0x080e3849 <write_sha1_file+25>:        lea    -0x44(%ebp),%edi
> 0x080e384c <write_sha1_file+28>:        lea    -0x24(%ebp),%esi
> 0x080e384f <write_sha1_file+31>:        mov    %edi,0x4(%esp)
> 0x080e3853 <write_sha1_file+35>:        mov    %esi,(%esp)
> 0x080e3856 <write_sha1_file+38>:        mov    0x10(%ebp),%ecx
> 0x080e3859 <write_sha1_file+41>:        mov    0xc(%ebp),%edx
> 0x080e385c <write_sha1_file+44>:        mov    0x8(%ebp),%eax
> 0x080e385f <write_sha1_file+47>:        call   0x80e0350 <write_sha1_file_prepare>
> 0x080e3864 <write_sha1_file+52>:        test   %ebx,%ebx
> 0x080e3866 <write_sha1_file+54>:        je     0x80e3885 <write_sha1_file+85>
>
> 0x080e3868 <write_sha1_file+56>:        mov    -0x24(%ebp),%eax
> 0x080e386b <write_sha1_file+59>:        mov    %eax,(%ebx)
> 0x080e386d <write_sha1_file+61>:        mov    -0x20(%ebp),%eax
> 0x080e3870 <write_sha1_file+64>:        mov    %eax,0x4(%ebx)
> 0x080e3873 <write_sha1_file+67>:        mov    -0x1c(%ebp),%eax
> 0x080e3876 <write_sha1_file+70>:        mov    %eax,0x8(%ebx)
> 0x080e3879 <write_sha1_file+73>:        mov    -0x18(%ebp),%eax
> 0x080e387c <write_sha1_file+76>:        mov    %eax,0xc(%ebx)
> 0x080e387f <write_sha1_file+79>:        mov    -0x14(%ebp),%eax
> 0x080e3882 <write_sha1_file+82>:        mov    %eax,0x10(%ebx)
>
>  I admit that I am not particular familar with intel machine
>  instructions, but I guess that the above 10 mov instructions is the
>  result for the compiled inline hashcpy() in the write_sha1_file()
>  function in sha1_file.c
>
>  Question: would it be possible for the compiler to compile it down to
>  just 5 mov instructions if we had used unsigned 32 bits type?  Or is
>  this the best we can reasonable hope for inside the write_sha1_file()
>  function?
>
>  I checked 3 other output of "disassemble function_foo", and it seems
>  that those 3 functions I checked got 10 mov instructions for the
>  inline hashcpy(), as far as I can tell.
>
> 0x080e3885 <write_sha1_file+85>:        mov    %esi,(%esp)
> 0x080e3888 <write_sha1_file+88>:        call   0x80e3800 <has_sha1_file>
> 0x080e388d <write_sha1_file+93>:        xor    %edx,%edx
> 0x080e388f <write_sha1_file+95>:        test   %eax,%eax
> 0x080e3891 <write_sha1_file+97>:        jne    0x80e38b6 <write_sha1_file+134>
> 0x080e3893 <write_sha1_file+99>:        mov    0xc(%ebp),%eax
> 0x080e3896 <write_sha1_file+102>:       mov    %edi,%edx
> 0x080e3898 <write_sha1_file+104>:       mov    %eax,0x4(%esp)
> 0x080e389c <write_sha1_file+108>:       mov    -0x10(%ebp),%ecx
> 0x080e389f <write_sha1_file+111>:       mov    0x8(%ebp),%eax
> 0x080e38a2 <write_sha1_file+114>:       movl   $0x0,0x8(%esp)
> 0x080e38aa <write_sha1_file+122>:       mov    %eax,(%esp)
> 0x080e38ad <write_sha1_file+125>:       mov    %esi,%eax
> 0x080e38af <write_sha1_file+127>:       call   0x80e1e40 <write_loose_object>
> 0x080e38b4 <write_sha1_file+132>:       mov    %eax,%edx
> 0x080e38b6 <write_sha1_file+134>:       mov    %edx,%eax
> 0x080e38b8 <write_sha1_file+136>:       mov    -0xc(%ebp),%ebx
> 0x080e38bb <write_sha1_file+139>:       mov    -0x8(%ebp),%esi
> 0x080e38be <write_sha1_file+142>:       mov    -0x4(%ebp),%edi
> 0x080e38c1 <write_sha1_file+145>:       leave
> 0x080e38c2 <write_sha1_file+146>:       ret
> End of assembler dump.
> (gdb)
>
>  So, maybe the compiler is doing the right thing after all?
>

Well, I just tested this with GCC myself. I used this segment of code:

        #include <memory.h>
        void hashcpy(unsigned char *sha_dst, const unsigned char *sha_src)
        {
                memcpy(sha_dst, sha_src, 20);
        }

I compiled using Apple's GCC 4.0.1 (note that GCC 4.3 and 4.4 vanilla
yield the same code) with these parameters to get Intel assembly:
        gcc -O2 -arch i386 -march=pentium3 -mtune=pentium3
-fomit-frame-pointer -fno-strict-aliasing -S test.c
and these parameters to get the equivalent PowerPC code:
        gcc -O2 -mcpu=G5 -arch ppc -fomit-frame-pointer
-fno-strict-aliasing -S test.c

Intel code:
        .text
        .align 4,0x90
.globl _hashcpy
_hashcpy:
        subl    $12, %esp
        movl    20(%esp), %edx
        movl    16(%esp), %ecx
        movl    (%edx), %eax
        movl    %eax, (%ecx)
        movl    4(%edx), %eax
        movl    %eax, 4(%ecx)
        movl    8(%edx), %eax
        movl    %eax, 8(%ecx)
        movl    12(%edx), %eax
        movl    %eax, 12(%ecx)
        movl    16(%edx), %eax
        movl    %eax, 16(%ecx)
        addl    $12, %esp
        ret
        .subsections_via_symbols


and the PowerPC code:

        .section __TEXT,__text,regular,pure_instructions
        .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
        .machine ppc970
        .text
        .align 2
        .p2align 4,,15
        .globl _hashcpy
_hashcpy:
        lwz r0,0(r4)
        lwz r2,4(r4)
        lwz r9,8(r4)
        lwz r11,12(r4)
        stw r0,0(r3)
        stw r2,4(r3)
        stw r9,8(r3)
        stw r11,12(r3)
        lwz r0,16(r4)
        stw r0,16(r3)
        blr
        .subsections_via_symbols


So it does look like GCC does what it should and it inlines the memcpy.

A bit off topic, but the results are rather interesting to me, and I
think I see a weakness in how GCC is doing this on Intel. Someone
please correct me if I'm wrong, but the PowerPC code seems much better
because it can yield very high instruction-level parallelism. It does
5 loads and then 5 stores, using 4 registers for temporary storage and
2 registers for pointers.

I realize the Intel x86 architecture is quite constrained in that it
has so few general purpose registers, but there has to be better code
than what GCC emitted above. It seems like the processor would stall
because of the quantity of sequential inter-dependent instructions
that can't be done in parallel (mov to memory that depends on a mov to
eax, etc).

I suppose the code might not be stalling if it's using the maximum
number of registers and doing as many memory accesses that it can per
clock, but based on known details about the architecture, does it seem
to be doing that?

- Steven

next prev parent reply	other threads:[~2009-05-01  0:24 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-04-27  8:55 Eric Sink's blog - notes on git, dscms and a "whole product" approach Martin Langhoff
2009-04-28 11:24 ` Cross-Platform Version Control (was: Eric Sink's blog - notes on git, dscms and a "whole product" approach) Jakub Narebski
2009-04-28 21:00   ` Robin Rosenberg
2009-04-29  6:55   ` Martin Langhoff
2009-04-29  7:21     ` Jeff King
2009-04-29 20:05       ` Markus Heidelberg
2009-04-29  7:52     ` Cross-Platform Version Control Jakub Narebski
2009-04-29  8:25       ` Martin Langhoff
2009-04-28 18:16 ` Eric Sink's blog - notes on git, dscms and a "whole product" approach Jakub Narebski
2009-04-29  7:54   ` Sitaram Chamarty
2009-04-30 12:17   ` Why Git is so fast (was: Re: Eric Sink's blog - notes on git, dscms and a "whole product" approach) Jakub Narebski
2009-04-30 12:56     ` Michael Witten
2009-04-30 15:28       ` Why Git is so fast Jakub Narebski
2009-04-30 18:52         ` Shawn O. Pearce
2009-04-30 20:36           ` Kjetil Barvik
2009-04-30 20:40             ` Shawn O. Pearce
2009-04-30 21:36               ` Kjetil Barvik
2009-05-01  0:23                 ` Steven Noonan [this message]
2009-05-01  1:25                   ` James Pickens
2009-05-01  9:19                   ` Kjetil Barvik
2009-05-01  9:34                     ` Mike Hommey
2009-05-01  9:42                       ` Kjetil Barvik
2009-05-01 17:42                 ` Tony Finch
2009-05-01  5:24             ` Dmitry Potapov
2009-05-01  9:42               ` Mike Hommey
2009-05-01 10:46                 ` Dmitry Potapov
2009-04-30 18:43       ` Why Git is so fast (was: Re: Eric Sink's blog - notes on git, dscms and a "whole product" approach) Shawn O. Pearce
2009-04-30 14:22     ` Jeff King
2009-05-01 18:43       ` Linus Torvalds
2009-05-01 19:08         ` Jeff King
2009-05-01 19:13           ` david
2009-05-01 19:32             ` Nicolas Pitre
2009-05-01 21:17           ` Daniel Barkalow
2009-05-01 21:37           ` Linus Torvalds
2009-05-01 22:11             ` david
2009-04-30 18:56     ` Nicolas Pitre
2009-04-30 19:16       ` Alex Riesen
2009-05-04  8:01         ` Why Git is so fast Andreas Ericsson
2009-04-30 19:33       ` Jakub Narebski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f488382f0904301723i37ef03d9w4e93848e603ed28b@mail.gmail.com \
    --to=steven@uplinklabs.net \
    --cc=barvik@broadpark.no \
    --cc=git@vger.kernel.org \
    --cc=spearce@spearce.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).