From: Nicolas Bock <nicolasbock@gmail.com>
To: Frederic Marmond <fmarmond@gmail.com>
Cc: linux-assembly@vger.kernel.org
Subject: Re: 4x4 single-precision matrix product with SSE
Date: Sun, 13 Mar 2011 14:23:31 -0600 [thread overview]
Message-ID: <4D7D27C3.5050404@gmail.com> (raw)
In-Reply-To: <AANLkTimCWmanFU19admtg5q18HvCOxrdjm+9XWFT-0Zm@mail.gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 1775 bytes --]
I have attached a short test project that demonstrates what I am doing.
I time this simply with the shell's time command, i.e.
$ time ./mul_SSE 100000000
real 0m1.037s
user 0m1.036s
sys 0m0.001s
$ time ./mul_SSE4_1 100000000
real 0m2.006s
user 0m2.003s
sys 0m0.002s
I assume that I have prepared the A matrix for SSE a little bit by
"dilating" the elements into A = { A11, A11, A11, A11, A12, A12, ... },
while for SSE4.1 I am calling the multiply with the transpose of B.
As these matrices are really small, they should be completely in L1, so
the movaps operation should have pretty low latency. Since the SSE
version uses 4 times more data for A than the SSE4.1 version, I am
surprised that given the larger number of data movements for the SSE
version it still beats the SSE4.1 version. But maybe I am just not
coding this very intelligently.
Any suggestions would be very welcome,
Thanks already, nick
On 03/12/11 01:20, Frederic Marmond wrote:
> Hello Nicolas,
>
> Yes, it's the right place :)
> could you please paste your code as well as your benchmark context ?
>
> Fred
>
> 2011/3/11 Nicolas Bock <nicolasbock@gmail.com
> <mailto:nicolasbock@gmail.com>>
>
> Hello list,
>
> I am writing an assembly function that multiplies 2 4x4 single precision
> matrices. I wrote 2 versions, one using SSE the other using SSE4.1. What
> surprised me is that the SSE4.1 version fails to beat the SSE version,
> it is in fact slightly slower.
>
> Is this the right place to ask for help? If anyone is interested I can
> post some code which would maybe clarify the situation a bit.
>
> If this is not the right place, please ignore me...
>
> nick
>
>
[-- Attachment #1.2: Makefile --]
[-- Type: text/plain, Size: 416 bytes --]
# Builds the two benchmark executables. Both share main.c, which is compiled
# twice with a different -D switch to select the assembly kernel it calls.

#CFLAGS = -O0 -g
CC = gcc
CFLAGS = -O2 -ffast-math

# Neither target names a real file, so keep make from being confused by
# stray files called "all" or "clean".
.PHONY: all clean

all : mul_SSE mul_SSE4_1

mul_SSE : main_SSE.o matrix_multiply_SSE.o
	$(CC) -o $@ $^

mul_SSE4_1 : main_SSE4_1.o matrix_multiply_SSE4_1.o
	$(CC) -o $@ $^

# Remove the objects and the linked executables.
clean:
	rm -f *.o mul_SSE mul_SSE4_1

# main.c compiled for the plain SSE kernel.
main_SSE.o : main.c
	$(CC) $(CFLAGS) -DSSE -c -o $@ $<

# main.c compiled for the SSE4.1 kernel.
main_SSE4_1.o : main.c
	$(CC) $(CFLAGS) -DSSE4_1 -c -o $@ $<

%.o : %.c
	$(CC) $(CFLAGS) -c -o $@ $<

# .S files are run through the C preprocessor before being assembled.
%.o : %.S
	$(CC) $(CFLAGS) -c -o $@ $<
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.3: main.c --]
[-- Type: text/x-csrc; name="main.c", Size: 1756 bytes --]
#include <stdio.h>
#include <stdlib.h>
/* When defined, A, B and C are filled with rand() values; when undefined
 * they are filled with the running index i*4+j instead. */
#define RANDOM_MATRIX
/* Uncomment to print the resulting C matrix after the kernel returns. */
//#define PRINT_DEBUG
/* Prototype of the assembly kernel selected at build time with -DSSE or
 * -DSSE4_1. Both take an iteration count N and three 4x4 float matrices.
 * main() hands the SSE kernel the dilated copy of A and the plain B, and
 * hands the SSE4.1 kernel the plain A and the transpose of B. */
#if defined(SSE)
void
matrix_multiply_SSE (const unsigned int N, float *A, float *B, float *C);
#elif defined(SSE4_1)
void
matrix_multiply_SSE4_1 (const unsigned int N, float *A, float *B, float *C);
#endif
/*
 * Benchmark driver for the 4x4 matrix multiply kernels.
 *
 * Usage: prog [N]
 *
 * N is the number of times the kernel repeats the multiply (default 1).
 * The matrices are filled, the layout each kernel expects is prepared
 * (dilated A for SSE, transposed B for SSE4.1), and the kernel selected at
 * compile time is called once.
 *
 * Returns 0 on success, EXIT_FAILURE if N is not a valid non-negative
 * integer that fits in an unsigned int.
 */
int
main (int argc, char **argv)
{
  float __attribute__ ((aligned (64))) A[4][4];
  float __attribute__ ((aligned (64))) A_dilated[4][4][4];
  float __attribute__ ((aligned (64))) B[4][4];
  float __attribute__ ((aligned (64))) B_transpose[4][4];
  float __attribute__ ((aligned (64))) C[4][4];

  int i, j;
  unsigned int max_N = 1;

  /* Parse command line. */
  if (argc == 2)
  {
    char *end;
    long parsed = strtol(argv[1], &end, 10);

    /* Reject empty input, trailing garbage, negative counts and values
     * that would be truncated when stored in an unsigned int. */
    if (end == argv[1] || *end != '\0' || parsed < 0
        || (unsigned long) parsed != (unsigned int) parsed)
    {
      fprintf(stderr, "usage: %s [N]  (N: non-negative iteration count)\n",
              argv[0]);
      return EXIT_FAILURE;
    }
    max_N = (unsigned int) parsed;
  }

  /* Fill matrix with some random stuff. */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++)
    {
#ifndef RANDOM_MATRIX
      A[i][j] = i*4+j;
      B[i][j] = i*4+j;
      C[i][j] = i*4+j;
#else
      A[i][j] = rand()/(float) RAND_MAX;
      B[i][j] = rand()/(float) RAND_MAX;
      C[i][j] = rand()/(float) RAND_MAX;
#endif

      /* Layout for the SSE4.1 kernel: rows of B_transpose are columns of B. */
      B_transpose[j][i] = B[i][j];

      /* Layout for the SSE kernel: every element of A replicated 4 times so
       * it can be loaded directly as a broadcast vector. */
      A_dilated[i][j][0] = A[i][j];
      A_dilated[i][j][1] = A[i][j];
      A_dilated[i][j][2] = A[i][j];
      A_dilated[i][j][3] = A[i][j];
    }
  }

#ifdef SSE
  matrix_multiply_SSE(max_N, (float*) &A_dilated[0][0], (float*) &B[0][0], (float*) &C[0][0]);
#elif defined(SSE4_1)
  matrix_multiply_SSE4_1(max_N, (float*) &A[0][0], (float*) &B_transpose[0][0], (float*) &C[0][0]);
#endif

#ifdef PRINT_DEBUG
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++)
    {
      //printf(" %i", (int) C[i][j]);
      printf(" %f", C[i][j]);
    }
    printf("\n");
  }
#endif

  return 0;
}
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.4: matrix_multiply_SSE.S --]
[-- Type: text/x-asm; name="matrix_multiply_SSE.S", Size: 2001 bytes --]
# C API:
#
# void
# matrix_multiply_SSE (const unsigned int N, float *A, float *B, float *C);
#
# Performs C += A * B on 4x4 single-precision matrices and repeats that
# update N times, reloading and storing C on every iteration.
#
# A must be passed in dilated form: each of its 16 elements is replicated
# four times, so element (r,c) occupies the 16-byte vector at byte offset
# (4*r + c) * 16. B and C are ordinary row-major 4x4 matrices.
# All three pointers must be 16-byte aligned (every access uses movaps).

#define N %rdi
#define A %rsi
#define B %rdx
#define C %rcx
#define i %rax

        .text
        .align 256
        .global matrix_multiply_SSE
        .type matrix_multiply_SSE, @function

matrix_multiply_SSE:

        # N is a 32-bit argument, so the calling convention leaves the upper
        # half of its register undefined. Zero-extend it before the 64-bit
        # test and compare below use the full register.
        mov %edi, %edi

        # The loop counter register is saved and restored although the
        # calling convention does not require it.
        push i
        xor i, i

        # Skip the loop entirely for N == 0.
        test N, N
        jz end_loop

start_loop:

        # Rows of C are accumulated in xmm0..xmm3, rows of B live in
        # xmm4..xmm7.
        movaps 0x00(C), %xmm0
        movaps 0x10(C), %xmm1
        movaps 0x20(C), %xmm2
        movaps 0x30(C), %xmm3

        movaps 0x00(B), %xmm4
        movaps 0x10(B), %xmm5
        movaps 0x20(B), %xmm6
        movaps 0x30(B), %xmm7

        # Calculate C(1,:).
        # Each broadcast element of A scales one row of B. Loads for the
        # next row of A are interleaved with the arithmetic of the current
        # one, so the row boundaries below are approximate.
        movaps 0x000(A), %xmm8
        movaps 0x010(A), %xmm9
        movaps 0x020(A), %xmm10
        mulps %xmm4, %xmm8
        mulps %xmm5, %xmm9
        addps %xmm8, %xmm0
        movaps 0x030(A), %xmm11
        mulps %xmm6, %xmm10
        addps %xmm9, %xmm0
        movaps 0x040(A), %xmm12
        mulps %xmm7, %xmm11
        addps %xmm10, %xmm0
        movaps 0x050(A), %xmm13
        mulps %xmm4, %xmm12
        addps %xmm11, %xmm0

        # Calculate C(2,:).
        movaps 0x060(A), %xmm14
        mulps %xmm5, %xmm13
        addps %xmm12, %xmm1
        movaps 0x070(A), %xmm15
        mulps %xmm6, %xmm14
        addps %xmm13, %xmm1
        movaps 0x080(A), %xmm8
        mulps %xmm7, %xmm15
        addps %xmm14, %xmm1
        movaps 0x090(A), %xmm9
        mulps %xmm4, %xmm8
        addps %xmm15, %xmm1

        # Calculate C(3,:).
        movaps 0x0a0(A), %xmm10
        mulps %xmm5, %xmm9
        addps %xmm8, %xmm2
        movaps 0x0b0(A), %xmm11
        mulps %xmm6, %xmm10
        addps %xmm9, %xmm2
        movaps 0x0c0(A), %xmm12
        mulps %xmm7, %xmm11
        addps %xmm10, %xmm2
        movaps 0x0d0(A), %xmm13
        mulps %xmm4, %xmm12
        addps %xmm11, %xmm2

        # Calculate C(4,:).
        movaps 0x0e0(A), %xmm14
        mulps %xmm5, %xmm13
        addps %xmm12, %xmm3
        movaps 0x0f0(A), %xmm15
        mulps %xmm6, %xmm14
        addps %xmm13, %xmm3
        mulps %xmm7, %xmm15
        addps %xmm14, %xmm3
        addps %xmm15, %xmm3

        # Write C back.
        movaps %xmm0, 0x00(C)
        movaps %xmm1, 0x10(C)
        movaps %xmm2, 0x20(C)
        movaps %xmm3, 0x30(C)

        inc i
        cmp N, i
        jb start_loop

end_loop:

        pop i
        ret

        .size matrix_multiply_SSE, .-matrix_multiply_SSE

        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.5: matrix_multiply_SSE4_1.S --]
[-- Type: text/x-asm; name="matrix_multiply_SSE4_1.S", Size: 2380 bytes --]
# C API:
#
# void
# matrix_multiply_SSE4_1 (const unsigned int N, float *A, float *B, float *C);
#
# Performs C += A * B on 4x4 single-precision matrices and repeats that
# update N times, reloading and storing C on every iteration.
#
# A and C are ordinary row-major 4x4 matrices. The B argument must hold the
# transpose of the mathematical B, so that each 16-byte row loaded from it
# is one column of B and can be fed straight into a dot product.
# All three pointers must be 16-byte aligned (every access uses movaps).

#define N %rdi
#define A %rsi
#define B %rdx
#define C %rcx
#define i %rax

        .text
        .align 256
        .global matrix_multiply_SSE4_1
        .type matrix_multiply_SSE4_1, @function

matrix_multiply_SSE4_1:

        # N is a 32-bit argument, so the calling convention leaves the upper
        # half of its register undefined. Zero-extend it before the 64-bit
        # test and compare below use the full register.
        mov %edi, %edi

        # The loop counter register is saved and restored although the
        # calling convention does not require it.
        push i
        xor i, i

        # Skip the loop entirely for N == 0.
        test N, N
        jz end_loop

start_loop:

        # Rows of C are accumulated in xmm0..xmm3, the four columns of B
        # (rows of the transposed input) live in xmm4..xmm7.
        movaps 0x00(C), %xmm0
        movaps 0x10(C), %xmm1
        movaps 0x20(C), %xmm2
        movaps 0x30(C), %xmm3

        movaps 0x00(B), %xmm4
        movaps 0x10(B), %xmm5
        movaps 0x20(B), %xmm6
        movaps 0x30(B), %xmm7

        movaps 0x00(A), %xmm8
        movaps 0x10(A), %xmm9

        # Calculate C(1,:).
        # dpps destroys its destination, so each column of B is copied
        # first. The high nibble 0xf of the immediate sums all four
        # products; the low nibble picks the single lane that receives the
        # result (the others are zeroed). The three blendps then gather the
        # four lanes into one vector.
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm8, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm8, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm8, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm8, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm0

        # Row 3 of A replaces row 1, which is no longer needed.
        movaps 0x20(A), %xmm8

        # Calculate C(2,:).
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm9, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm9, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm9, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm9, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm1

        # Row 4 of A replaces row 2, which is no longer needed.
        movaps 0x30(A), %xmm9

        # Calculate C(3,:).
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm8, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm8, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm8, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm8, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm2

        # Calculate C(4,:).
        movaps %xmm4, %xmm10
        dpps $0xf1, %xmm9, %xmm10
        movaps %xmm5, %xmm11
        dpps $0xf2, %xmm9, %xmm11
        movaps %xmm6, %xmm12
        dpps $0xf4, %xmm9, %xmm12
        movaps %xmm7, %xmm13
        dpps $0xf8, %xmm9, %xmm13
        blendps $0x01, %xmm10, %xmm11
        blendps $0x03, %xmm11, %xmm12
        blendps $0x07, %xmm12, %xmm13
        addps %xmm13, %xmm3

        # Write C back.
        movaps %xmm0, 0x00(C)
        movaps %xmm1, 0x10(C)
        movaps %xmm2, 0x20(C)
        movaps %xmm3, 0x30(C)

        inc i
        cmp N, i
        jb start_loop

end_loop:

        pop i
        ret

        .size matrix_multiply_SSE4_1, .-matrix_multiply_SSE4_1

        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
next prev parent reply other threads:[~2011-03-13 20:23 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-03-11 22:49 4x4 single-precision matrix product with SSE Nicolas Bock
2011-03-12 8:32 ` Frederic Marmond
[not found] ` <AANLkTimCWmanFU19admtg5q18HvCOxrdjm+9XWFT-0Zm@mail.gmail.com>
2011-03-13 20:23 ` Nicolas Bock [this message]
[not found] ` <AANLkTim-ZqzJ+2q+u=7+yRjzTf7FQDcuu-YDN=RV0H6X@mail.gmail.com>
[not found] ` <AANLkTimny0PkR0bYBjKgaH4j=_=2aL=rt=YcDjWeQCG6@mail.gmail.com>
2011-03-14 15:43 ` Fwd: " Nicolas Bock
2012-09-05 19:13 ` Nicolas Bock
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4D7D27C3.5050404@gmail.com \
--to=nicolasbock@gmail.com \
--cc=fmarmond@gmail.com \
--cc=linux-assembly@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).