Question about MPC7410/MPC7455 Altivec and performance.

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* Question about MPC7410/MPC7455 Altivec and performance.
@ 2003-02-14  3:25 Arun Dharankar
  2003-02-16 17:57 ` Arun Dharankar
  0 siblings, 1 reply; 2+ messages in thread
From: Arun Dharankar @ 2003-02-14  3:25 UTC (permalink / raw)
  To: linuxppc-embedded


Greetings!

On a PowerMAC G4 (7455), the test C code shown towards the
end of this file was tried in two builds: one with standard gcc,
and the other with gcc with Altivec enabled (and the C code
preprocessed with an Altivec preprocessor).

The Altivec/vectorized code performs better than the non-vectorized
code by about 40%.


The same binaries (statically linked) were tried on a MPC7410
based board. The performance of the vectorized program was
observed to be 18% slower than the non-vectorized code.



The board is MPC7410 with 8260 in companion mode (core disabled),
and the Linux kernel has been Altivec enabled (the program anyway
will not work if Altivec is disabled in the Linux kernel).



There is one change I have made to the Linux kernel, which can
be described as follows. I don't see how it can affect Altivec,
but I am mentioning it here - just in case I am missing something.

The memory controller is the MPC8260, and it does not recognize the TLBIE
transaction type (0x18) as a special case. The Linux kernel code
performing the TLBIEs currently provides the virtual/effective
address whose TLB entry needs invalidation. To work around this, I
modify the address passed to tlbie so that only bits 14 to 19 remain the
same as the original address, and the other bits are zeroed (essentially,
the address is guaranteed to fall in the physical memory address
range, and the memory controller responds).

Anyway, this seems to work quite well under different combinations
of non-Altivec/non-vectorized load conditions.


The Linux kernel version is 2.4.20, and GCC is 2.95.2 (with the patch
to support the "-fvec" option, available at altivec.org).


Any ideas why 7410 performance would degrade as described above?
Or how this could be debugged?


Best regards,
-Arun.

-------------------------------------------------------------
/*
 * Scalar reference benchmark: accumulates the dot product of two
 * 99-element float arrays into x, repeating the whole pass n times,
 * where n is taken from the first command-line argument.
 *
 * Returns 0 on success, 1 if the repeat count was not supplied.
 */
int
main(int ac, char *av[]) {

        /*
         * Everything the loop reads is given a defined value: the original
         * accumulated into an uninitialized x and multiplied uninitialized
         * stack contents, so the data fed to the FPU differed from run to run.
         */
        float a[99] = { 0 }, b[99] = { 0 }, x = 0.0f;
        int i, j, n;

        /* av[1] only exists when an argument was actually passed. */
        if (ac < 2)
                return 1;
        n = atoi(av[1]);

        for ( i=0; i < n; i++ )
                for(j=0; j<99; j++)
                        x += a[j]*b[j];

        return 0;
}
--------------------------------------------------------------
 /*
  * AltiVec-vectorized form of the scalar loop "x += a[j]*b[j]" for
  * j = 0..98, repeated n times (n = atoi(av[1])).  One of two code paths
  * is chosen at run time from the addresses of a[] and b[]; each path
  * handles 16 floats per inner iteration (four vec_madd accumulators) and
  * then a 3-element tail.
  *
  * NOTE(review): a[], b[] and x are read without ever being initialized,
  * and av[1] is used without checking ac.
  */
 int  main( int ac, char *av[] )
 {
    float a[99], b[99], x;
    int i, j, n = atoi(av[1]);
    for ( i=0; i < n; i++  )
    {
       /* True when the base address of a[] or b[] is not a multiple of 16. */
       if ( (((int )&a[0] | (int )&b[0]) & 15) != 0 )
       {
          {
             {
                /* NOTE(review): j4..j7, a9v and b9v are declared but never used. */
                int j1, j2, j3, j4, j5, j6, j7;
                vector float a1v, b1v, x1v, r2v;
                vector float x2v = (vector float )(0);
                vector float r6v = (vector float )(0);
                vector float r1v = (vector float )(0.);
                vector float a9v, a10v, b9v, b10v;
                vector float r5v = (vector float )(0);
                vector float a7v, a8v, b7v, b8v;
                vector float r4v = (vector float )(0);
                vector float a5v, a6v, b5v, b6v;
                vector float r3v = (vector float )(0);
                vector float a2v, a3v;
                /*
                 * Permute-control vectors derived from the addresses of a[] and
                 * b[]; each is passed to vec_perm below together with two
                 * consecutive vec_ld results -- presumably to realign the data
                 * (see vec_lvsl / vec_perm in the AltiVec PIM).
                 */
                vector unsigned char a4v = vec_lvsl(0, &a[0]);
                vector float b2v, b3v;
                vector unsigned char b4v = vec_lvsl(0, &b[0]);
                /* Tail masks; only entry [3-1] (last lane all ones) is used below. */
                static vector unsigned long j1v[3] =  { (
                vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
                } ;
                vector float r7v;
                /*
                 * Value written to the VSCR by vec_mtvscr.  Presumably the single 1
                 * selects the non-Java (NJ) mode bit -- TODO confirm against the
                 * VSCR layout.
                 */
                vector signed short k1v = (vector signed short )(0, 0, 0,
                0, 0, 0, 1, 0);
                vec_mtvscr( k1v );
                /* Place scalar x in the first float of x2v, then splat element 0 into x1v. */
                *((float *)&x2v) = x;
                x1v = vec_splat(x2v, 0);
                /* Prime a2v/b2v with the first vector; each is carried across iterations. */
                a2v = vec_ld(0, &a[0]);
                b2v = vec_ld(0, &b[0]);
                /* Bound is 84, step 16: j1 = 0,16,...,80 (six passes, elements 0..95). */
                for ( j1 = 0; j1 < (99 - 4 * 4) + 1; j1 += 4 * 4 )
                {
                   /* j3: byte offset of element j1 (sizeof(int) used as element size). */
                   j3 = j1 * sizeof(int );
                   /* j2: byte offset 16 bytes past j3, i.e. the following vector. */
                   j2 = j3 + 4 * sizeof(int );
                   a3v = vec_ld(j2, &a[0]);
                   b3v = vec_ld(j2, &b[0]);
                   a5v = vec_ld(j2 + 16, &a[0]);
                   b5v = vec_ld(j2 + 16, &b[0]);
                   a7v = vec_ld(j2 + 32, &a[0]);
                   b7v = vec_ld(j2 + 32, &b[0]);
                   /* a2v/b2v are consumed by vec_perm before being reloaded for the next pass. */
                   a1v = vec_perm(a2v, a3v, a4v);
                   a2v = vec_ld(j2 + 48, &a[0]);
                   b1v = vec_perm(b2v, b3v, b4v);
                   b2v = vec_ld(j2 + 48, &b[0]);
                   /* Four independent multiply-add accumulators: r1v, r3v, r4v, r5v. */
                   r1v = vec_madd(a1v, b1v, r1v);
                   a6v = vec_perm(a3v, a5v, a4v);
                   b6v = vec_perm(b3v, b5v, b4v);
                   r3v = vec_madd(a6v, b6v, r3v);
                   a8v = vec_perm(a5v, a7v, a4v);
                   b8v = vec_perm(b5v, b7v, b4v);
                   r4v = vec_madd(a8v, b8v, r4v);
                   a10v = vec_perm(a7v, a2v, a4v);
                   b10v = vec_perm(b7v, b2v, b4v);
                   r5v = vec_madd(a10v, b10v, r5v);
                }
                /* j1 is 96 after the loop, so the partial accumulators are always merged. */
                if ( j1 )
                {
                   r1v = vec_add(r1v, r3v);
                   r1v = vec_add(r1v, r4v);
                   r1v = vec_add(r1v, r5v);
                }
                /* Tail: one more vector starting at element 96 (99 - 96 = 3 elements left). */
                j3 = j1 * sizeof(int );
                j2 = j3 + 4 * sizeof(int );
                a3v = vec_ld(j2, &a[0]);
                a1v = vec_perm(a2v, a3v, a4v);
                b3v = vec_ld(j2, &b[0]);
                b1v = vec_perm(b2v, b3v, b4v);
                /*
                 * Replace the last lane of the a-side vector with r6v (all zeros) so
                 * only three products are meant to contribute.
                 * NOTE(review): the b-side lane is not masked; if it holds NaN/Inf
                 * the product 0 * b would not be 0 -- confirm this is acceptable.
                 */
                r7v = vec_sel(a1v, r6v, j1v[3-1]);
                r1v = vec_madd(r7v, b1v, r1v);
                /* Horizontal sum: add the vector to itself shifted by 8, then by 4 bytes. */
                r2v = vec_sld(r1v, r1v, 8);
                r1v = vec_add(r1v, r2v);
                r2v = vec_sld(r1v, r1v, 4);
                r1v = vec_add(r1v, r2v);
                /* Add the previous value of x and store a single element back to x. */
                r1v = vec_add(r1v, x1v);
                vec_ste(r1v, 0, &x);
             }
          }
       }
       else
       {
          /* Both base addresses are multiples of 16: no vec_lvsl/vec_perm step. */
          {
             {
                /* NOTE(review): j11..j14 are never used; j9 is assigned but never read. */
                int j8, j9, j10, j11, j12, j13, j14;
                vector float a11v, b11v, x3v, r9v;
                vector float x4v = (vector float )(0);
                vector float r13v = (vector float )(0);
                vector float r8v = (vector float )(0.);
                vector float a14v, b14v;
                vector float r12v = (vector float )(0);
                vector float a13v, b13v;
                vector float r11v = (vector float )(0);
                vector float a12v, b12v;
                vector float r10v = (vector float )(0);
                /* Tail masks; only entry [3-1] (last lane all ones) is used below. */
                static vector unsigned long j2v[3] =  { (
                vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
                } ;
                vector float r14v;
                /*
                 * Value written to the VSCR by vec_mtvscr.  Presumably the single 1
                 * selects the non-Java (NJ) mode bit -- TODO confirm against the
                 * VSCR layout.
                 */
                vector signed short k2v = (vector signed short )(0, 0, 0,
                0, 0, 0, 1, 0);
                vec_mtvscr( k2v );
                /* Place scalar x in the first float of x4v, then splat element 0 into x3v. */
                *((float *)&x4v) = x;
                x3v = vec_splat(x4v, 0);
                /* Bound is 84, step 16: j8 = 0,16,...,80 (six passes, elements 0..95). */
                for ( j8 = 0; j8 < (99 - 4 * 4) + 1; j8 += 4 * 4 )
                {
                   /* j10: byte offset of element j8 (sizeof(int) used as element size). */
                   j10 = j8 * sizeof(int );
                   j9 = j10;
                   /* Four consecutive 16-byte loads from each array. */
                   a11v = vec_ld(j10, &a[0]);
                   b11v = vec_ld(j10, &b[0]);
                   a12v = vec_ld(j10 + 16, &a[0]);
                   b12v = vec_ld(j10 + 16, &b[0]);
                   a13v = vec_ld(j10 + 32, &a[0]);
                   b13v = vec_ld(j10 + 32, &b[0]);
                   a14v = vec_ld(j10 + 48, &a[0]);
                   b14v = vec_ld(j10 + 48, &b[0]);
                   /* Four independent multiply-add accumulators: r8v, r10v, r11v, r12v. */
                   r8v = vec_madd(a11v, b11v, r8v);
                   r10v = vec_madd(a12v, b12v, r10v);
                   r11v = vec_madd(a13v, b13v, r11v);
                   r12v = vec_madd(a14v, b14v, r12v);
                }
                /* j8 is 96 after the loop, so the partial accumulators are always merged. */
                if ( j8 )
                {
                   r8v = vec_add(r8v, r10v);
                   r8v = vec_add(r8v, r11v);
                   r8v = vec_add(r8v, r12v);
                }
                /*
                 * Tail: load the vector starting at element 96.
                 * NOTE(review): this 16-byte load spans elements 96..99, one float
                 * past the end of the 99-element arrays.
                 */
                j10 = j8 * sizeof(int );
                j9 = j10;
                a11v = vec_ld(j10, &a[0]);
                b11v = vec_ld(j10, &b[0]);
                /*
                 * Replace the last lane of the a-side vector with r13v (all zeros).
                 * NOTE(review): the b-side lane is not masked; if it holds NaN/Inf
                 * the product 0 * b would not be 0 -- confirm this is acceptable.
                 */
                r14v = vec_sel(a11v, r13v, j2v[3-1]);
                r8v = vec_madd(r14v, b11v, r8v);
                /* Horizontal sum: add the vector to itself shifted by 8, then by 4 bytes. */
                r9v = vec_sld(r8v, r8v, 8);
                r8v = vec_add(r8v, r9v);
                r9v = vec_sld(r8v, r8v, 4);
                r8v = vec_add(r8v, r9v);
                /* Add the previous value of x and store a single element back to x. */
                r8v = vec_add(r8v, x3v);
                vec_ste(r8v, 0, &x);
             }
          }
       }
    }
    return 0;
 }

--------------------------------------------------------------

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: Question about MPC7410/MPC7455 Altivec and performance.
  2003-02-14  3:25 Question about MPC7410/MPC7455 Altivec and performance Arun Dharankar
@ 2003-02-16 17:57 ` Arun Dharankar
  0 siblings, 0 replies; 2+ messages in thread
From: Arun Dharankar @ 2003-02-16 17:57 UTC (permalink / raw)
  To: linuxppc-embedded


Greetings... this turns out to be a vectorizer issue. If
the "float" arrays (in the sample/test C code) are moved out
into global/extern scope, the program performs almost 100%
better than the non-vectorized code (on 745x and 7410).


Best regards,
-Arun.


On Thursday 13 February 2003 10:25 pm, I wrote:
> Greetings!
>
> On a PowerMAC G4 (7455) a test C code as shown towards the
> end of this file was tried: with standard gcc, and other with
> gcc with Altivec enabled (and C code preprocessed with a
> Altivec preprocessor).
>
> The Altivec/vectorized code functions better than non-vectorized
> by about 40%.
>
>
> The same binaries (statically linked) were tried on a MPC7410
> based board. The performance of the vectorized program was
> observed to be 18% slower than the non-vectorized code.
>
>
>
> The board is MPC7410 with 8260 in companion mode (core disabled),
> and the Linux kernel has been Altivec enabled (the program anyway
> will not work if Altivec is disabled in the Linux kernel).
>
>
>
> There is one change I have made to the Linux kernel, which can
> be described as follows. I dont see how it can affect the Altivec,
> but mentioning it here - just in case I am missing something.
>
> The memory controller is MPC8260, and does not recognize TLBIE
> transaction type (0x18) as a special case. The Linux kernel code
> performaing the TLBIEs currently provided the virtual/effective
> address whose TLBE needs invalidation. To work around this, I
> modify address passed to tlbie so that only bits 14 to 19 remain the
> same as the original address, and other bits are zero'd (essentially,
> the address is guaranteed to fall in the physical memory address
> range, and the memory controller responds).
>
> Anyway, this seems to work quite well under different combinations
> of non-Altivec/non-vectorized load conditions.
>
>
> The Linux kernel version is 2.4.20, and GCC is 2.95.2 (with patch
> to support for "-fvec" option, availabel at altivec.org).
>
>
> Any ideas why 7410 performance would degrade as described above?
> Or how this could be debugged?
>
>
> Best regards,
> -Arun.
>
> -------------------------------------------------------------
> int
> main(int ac, char *av[]) {
>
>         float a[99], b[99], x;
>         int i, j, n = atoi(av[1]);
>
>         for ( i=0; i < n; i++ )
>                 for(j=0; j<99; j++)
>                         x += a[j]*b[j];
>
>         return 0;
> }
> --------------------------------------------------------------
>  int  main( int ac, char *av[] )
>  {
>     float a[99], b[99], x;
>     int i, j, n = atoi(av[1]);
>     for ( i=0; i < n; i++  )
>     {
>        if ( (((int )&a[0] | (int )&b[0]) & 15) != 0 )
>        {
>           {
>              {
>                 int j1, j2, j3, j4, j5, j6, j7;
>                 vector float a1v, b1v, x1v, r2v;
>                 vector float x2v = (vector float )(0);
>                 vector float r6v = (vector float )(0);
>                 vector float r1v = (vector float )(0.);
>                 vector float a9v, a10v, b9v, b10v;
>                 vector float r5v = (vector float )(0);
>                 vector float a7v, a8v, b7v, b8v;
>                 vector float r4v = (vector float )(0);
>                 vector float a5v, a6v, b5v, b6v;
>                 vector float r3v = (vector float )(0);
>                 vector float a2v, a3v;
>                 vector unsigned char a4v = vec_lvsl(0, &a[0]);
>                 vector float b2v, b3v;
>                 vector unsigned char b4v = vec_lvsl(0, &b[0]);
>                 static vector unsigned long j1v[3] =  { (
>                 vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
>                 0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
>                 0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
>                 } ;
>                 vector float r7v;
>                 vector signed short k1v = (vector signed short )(0, 0, 0,
>                 0, 0, 0, 1, 0);
>                 vec_mtvscr( k1v );
>                 *((float *)&x2v) = x;
>                 x1v = vec_splat(x2v, 0);
>                 a2v = vec_ld(0, &a[0]);
>                 b2v = vec_ld(0, &b[0]);
>                 for ( j1 = 0; j1 < (99 - 4 * 4) + 1; j1 += 4 * 4 )
>                 {
>                    j3 = j1 * sizeof(int );
>                    j2 = j3 + 4 * sizeof(int );
>                    a3v = vec_ld(j2, &a[0]);
>                    b3v = vec_ld(j2, &b[0]);
>                    a5v = vec_ld(j2 + 16, &a[0]);
>                    b5v = vec_ld(j2 + 16, &b[0]);
>                    a7v = vec_ld(j2 + 32, &a[0]);
>                    b7v = vec_ld(j2 + 32, &b[0]);
>                    a1v = vec_perm(a2v, a3v, a4v);
>                    a2v = vec_ld(j2 + 48, &a[0]);
>                    b1v = vec_perm(b2v, b3v, b4v);
>                    b2v = vec_ld(j2 + 48, &b[0]);
>                    r1v = vec_madd(a1v, b1v, r1v);
>                    a6v = vec_perm(a3v, a5v, a4v);
>                    b6v = vec_perm(b3v, b5v, b4v);
>                    r3v = vec_madd(a6v, b6v, r3v);
>                    a8v = vec_perm(a5v, a7v, a4v);
>                    b8v = vec_perm(b5v, b7v, b4v);
>                    r4v = vec_madd(a8v, b8v, r4v);
>                    a10v = vec_perm(a7v, a2v, a4v);
>                    b10v = vec_perm(b7v, b2v, b4v);
>                    r5v = vec_madd(a10v, b10v, r5v);
>                 }
>                 if ( j1 )
>                 {
>                    r1v = vec_add(r1v, r3v);
>                    r1v = vec_add(r1v, r4v);
>                    r1v = vec_add(r1v, r5v);
>                 }
>                 j3 = j1 * sizeof(int );
>                 j2 = j3 + 4 * sizeof(int );
>                 a3v = vec_ld(j2, &a[0]);
>                 a1v = vec_perm(a2v, a3v, a4v);
>                 b3v = vec_ld(j2, &b[0]);
>                 b1v = vec_perm(b2v, b3v, b4v);
>                 r7v = vec_sel(a1v, r6v, j1v[3-1]);
>                 r1v = vec_madd(r7v, b1v, r1v);
>                 r2v = vec_sld(r1v, r1v, 8);
>                 r1v = vec_add(r1v, r2v);
>                 r2v = vec_sld(r1v, r1v, 4);
>                 r1v = vec_add(r1v, r2v);
>                 r1v = vec_add(r1v, x1v);
>                 vec_ste(r1v, 0, &x);
>              }
>           }
>        }
>        else
>        {
>           {
>              {
>                 int j8, j9, j10, j11, j12, j13, j14;
>                 vector float a11v, b11v, x3v, r9v;
>                 vector float x4v = (vector float )(0);
>                 vector float r13v = (vector float )(0);
>                 vector float r8v = (vector float )(0.);
>                 vector float a14v, b14v;
>                 vector float r12v = (vector float )(0);
>                 vector float a13v, b13v;
>                 vector float r11v = (vector float )(0);
>                 vector float a12v, b12v;
>                 vector float r10v = (vector float )(0);
>                 static vector unsigned long j2v[3] =  { (
>                 vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
>                 0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
>                 0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
>                 } ;
>                 vector float r14v;
>                 vector signed short k2v = (vector signed short )(0, 0, 0,
>                 0, 0, 0, 1, 0);
>                 vec_mtvscr( k2v );
>                 *((float *)&x4v) = x;
>                 x3v = vec_splat(x4v, 0);
>                 for ( j8 = 0; j8 < (99 - 4 * 4) + 1; j8 += 4 * 4 )
>                 {
>                    j10 = j8 * sizeof(int );
>                    j9 = j10;
>                    a11v = vec_ld(j10, &a[0]);
>                    b11v = vec_ld(j10, &b[0]);
>                    a12v = vec_ld(j10 + 16, &a[0]);
>                    b12v = vec_ld(j10 + 16, &b[0]);
>                    a13v = vec_ld(j10 + 32, &a[0]);
>                    b13v = vec_ld(j10 + 32, &b[0]);
>                    a14v = vec_ld(j10 + 48, &a[0]);
>                    b14v = vec_ld(j10 + 48, &b[0]);
>                    r8v = vec_madd(a11v, b11v, r8v);
>                    r10v = vec_madd(a12v, b12v, r10v);
>                    r11v = vec_madd(a13v, b13v, r11v);
>                    r12v = vec_madd(a14v, b14v, r12v);
>                 }
>                 if ( j8 )
>                 {
>                    r8v = vec_add(r8v, r10v);
>                    r8v = vec_add(r8v, r11v);
>                    r8v = vec_add(r8v, r12v);
>                 }
>                 j10 = j8 * sizeof(int );
>                 j9 = j10;
>                 a11v = vec_ld(j10, &a[0]);
>                 b11v = vec_ld(j10, &b[0]);
>                 r14v = vec_sel(a11v, r13v, j2v[3-1]);
>                 r8v = vec_madd(r14v, b11v, r8v);
>                 r9v = vec_sld(r8v, r8v, 8);
>                 r8v = vec_add(r8v, r9v);
>                 r9v = vec_sld(r8v, r8v, 4);
>                 r8v = vec_add(r8v, r9v);
>                 r8v = vec_add(r8v, x3v);
>                 vec_ste(r8v, 0, &x);
>              }
>           }
>        }
>     }
>     return 0;
>  }
>
> --------------------------------------------------------------
>
>


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2003-02-16 17:57 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-02-14  3:25 Question about MPC7410/MPC7455 Altivec and performance Arun Dharankar
2003-02-16 17:57 ` Arun Dharankar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).