From: Alen Stojanov <astojanov@inf.ethz.ch>
To: linux-perf-users@vger.kernel.org
Subject: Some troubles with perf and measuring flops
Date: Thu, 6 Mar 2014 01:55:06 +0100 [thread overview]
Message-ID: <5317C76A.4050103@inf.ethz.ch> (raw)
[-- Attachment #1: Type: text/plain, Size: 1677 bytes --]
Dear Linux Perf Users Community,
I noticed some inconsistencies with the perf tool. I would like to
determine whether I am doing something wrong, or whether there is a
problem in the perf tool. Here is the problem:
I would like to obtain flops on a simple matrix-to-matrix multiplication
algorithm. The code is available in the attachment as mmmtest.c. To
obtain flops, I run the perf tool using raw counters. When I try to
obtain flops for matrices having sizes below 150x150, I obtain accurate
results. Example (anticipated flops: 100 * 100 * 100 * 2 = 2'000'000):
perf stat -e r538010 ./mmmtest 100
Performance counter stats for './mmmtest 100':
2,078,775 r538010
0.003889544 seconds time elapsed
However, whenever I try to run matrices of bigger size, the reported
flops are not even close to the flops that I am supposed to obtain
(anticipated results: 600 * 600 * 600 * 2 = 432'000'000):
perf stat -e r538010 ./mmmtest 600
Performance counter stats for './mmmtest 600':
2,348,148,851 r538010
0.955511968 seconds time elapsed
To give you more info to replicate the problem, I provide you with the
following:
CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores
Linux Kernel: 3.11.0-12-generic
GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)
Monitored events: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - Raw event:
0x538010 (converted using libpfm4)
I have compiled the mmmtest.c using gcc -O3 -march=corei7-avx -o mmmtest
mmmtest.c. You can also find mmmtest.s asm version in the attachment.
Do you know why this happens? How can I instruct perf to obtain
accurate results?
Greetings,
Alen
[-- Attachment #2: mmmtest.c --]
[-- Type: text/plain, Size: 479 bytes --]
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
/* Matrix dimensions and buffers shared between main() and compute(). */
int m, n, k;
double *A, *B, *C;

/*
 * compute - accumulate the matrix product A * B into C.
 *
 * Reads the global dimensions m, n, k and treats the global buffers as
 * row-major matrices: A is m x k, B is k x n, C is m x n.  Every C element
 * is updated in place (C += A * B), so whatever C holds on entry is part of
 * the result.  Each innermost step performs one multiply and one add.
 */
void compute() {
    int row = 0;

    while (row < m) {
        int col = 0;

        while (col < n) {
            int step = 0;

            while (step < k) {
                int c_at = row * n + col;   /* C[row][col]  */
                int a_at = row * k + step;  /* A[row][step] */
                int b_at = step * n + col;  /* B[step][col] */

                C[c_at] += A[a_at] * B[b_at];
                ++step;
            }
            ++col;
        }
        ++row;
    }
}
/*
 * Entry point: mmmtest <matrix-size>
 *
 * Parses the square-matrix dimension from argv[1], allocates three
 * zero-filled size x size matrices of doubles, runs compute() once and
 * releases the buffers.
 *
 * Returns EXIT_SUCCESS after a completed run, or EXIT_FAILURE when the
 * argument is missing or invalid, or when an allocation fails.
 */
int main(int argc, char **argv)
{
    char *end;
    long dim;
    size_t elems;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <matrix-size>\n",
                argc > 0 ? argv[0] : "mmmtest");
        return EXIT_FAILURE;
    }

    /*
     * compute() indexes the matrices with int arithmetic up to
     * dim * dim - 1, so the full product has to fit in an int.  strtol
     * saturates at LONG_MAX on overflow, which the same bound rejects.
     */
    dim = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || dim <= 0 || dim > INT_MAX / dim) {
        fprintf(stderr, "invalid matrix size: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    m = (int) dim; n = m; k = m;
    elems = (size_t) dim * (size_t) dim;

    /*
     * calloc gives every element a defined value (0.0); compute() reads
     * A and B and accumulates into C, so none of them may be left
     * uninitialized.
     */
    A = (double *) calloc(elems, sizeof(double));
    B = (double *) calloc(elems, sizeof(double));
    C = (double *) calloc(elems, sizeof(double));
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory for %ld x %ld matrices\n", dim, dim);
        free(A);    /* free(NULL) is a no-op */
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    compute ();

    free(A);
    free(B);
    free(C);
    return EXIT_SUCCESS;
}
[-- Attachment #3: mmmtest.s --]
[-- Type: text/plain, Size: 2423 bytes --]
# GCC output for mmmtest.c (x86-64, AT&T syntax: "op src, dst").
.file "mmmtest.c"
.text
.p2align 4,,15
# ---------------------------------------------------------------------
# void compute()
#   For every i < m, j < n, h < k:  C[i*n+j] += A[i*k+h] * B[h*n+j]
#   Reads the globals m, n, k, A, B, C; takes no register arguments.
#
# Register roles once the setup below has run:
#   r12d = m              ebp = n               esi = k
#   ebx  = i (row counter)
#   rdi  = &A[i*k]        (start of current row of A)
#   r11  = &C[i*n]        (start of current row of C)
#   r15  = B              (base of B, never modified)
#   r8   = n * 8          (bytes per row of B and of C)
#   r14  = k * 8          (bytes per row of A)
#   r13  = n * 8          (byte length of one C row, used for the row end)
#   rcx  = &C[i*n+j]      r10 = one past the end of the current C row
#   r9   = &B[j]          (top of column j of B)
#   rdx  = &B[h*n+j]      eax = h
#   xmm0 = running value of C[i*n+j]
#
# rbx, rbp and r12-r15 are saved on entry and restored before returning.
# ---------------------------------------------------------------------
.globl compute
.type compute, @function
compute:
.LFB14:
.cfi_startproc
# Save the six registers this function uses that it restores at .L9.
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
# r12d = m (load interleaved with the remaining pushes)
movl m(%rip), %r12d
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
# m <= 0: no rows, go straight to the epilogue.
testl %r12d, %r12d
jle .L9
# Load the remaining globals once; they are not re-read inside the loops.
movl n(%rip), %ebp
# i = 0
xorl %ebx, %ebx
movl k(%rip), %esi
movq B(%rip), %r15
movq A(%rip), %rdi
movq C(%rip), %r11
# eax = n - 1 (32-bit result, so rax holds it zero-extended)
leal -1(%rbp), %eax
# r8 = (long) n
movslq %ebp, %r8
# r13 = (n - 1) * 8 + 8 = n * 8; only used when n > 0 (see .L3)
leaq 8(,%rax,8), %r13
# r14 = (long) k
movslq %esi, %r14
# Scale both to byte strides: r8 = n * 8, r14 = k * 8
salq $3, %r8
salq $3, %r14
# ---- loop over rows i --------------------------------------------------
.L3:
# n <= 0: nothing to do for this row, just advance i.
testl %ebp, %ebp
jle .L5
# r10 = end of current C row, r9 = &B[0], rcx = &C[i*n+0]
leaq 0(%r13,%r11), %r10
movq %r15, %r9
movq %r11, %rcx
.p2align 4,,10
.p2align 3
# ---- loop over columns j -----------------------------------------------
.L8:
# k <= 0: no products for this element, skip the inner loop.
testl %esi, %esi
jle .L6
# xmm0 = C[i*n+j]; rdx = &B[0*n+j]; h = 0
vmovsd (%rcx), %xmm0
movq %r9, %rdx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
# ---- inner loop over h: one vmulsd and one vaddsd per iteration --------
.L7:
# xmm1 = A[i*k+h]
vmovsd (%rdi,%rax,8), %xmm1
# h++
addq $1, %rax
# xmm1 *= B[h*n+j] (rdx still points at the pre-increment h)
vmulsd (%rdx), %xmm1, %xmm1
# rdx -> same column, next row of B
addq %r8, %rdx
# Compare k with h; the flags are consumed by the jg three lines down
# (the vector add and store in between leave the flags untouched).
cmpl %eax, %esi
# xmm0 += xmm1
vaddsd %xmm1, %xmm0, %xmm0
# The sum is written back to C[i*n+j] on every iteration, not only once
# after the loop (presumably because the compiler cannot prove that C
# does not overlap A or B).
vmovsd %xmm0, (%rcx)
# loop while k > h
jg .L7
.L6:
# Next column: advance the C element pointer and the B column pointer.
addq $8, %rcx
addq $8, %r9
# Done with the row once rcx reaches the row end.
cmpq %r10, %rcx
jne .L8
.L5:
# Next row: i++, step the A and C row pointers by one row each.
addl $1, %ebx
addq %r14, %rdi
addq %r8, %r11
# loop until i == m
cmpl %r12d, %ebx
jne .L3
# ---- epilogue: restore saved registers in reverse push order -----------
.L9:
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE14:
.size compute, .-compute
# ---------------------------------------------------------------------
# int main(int argc, char **argv)
#   In:  rsi = argv (edi = argc is never examined)
#   Parses argv[1] with strtol, stores the value in m, n and k, allocates
#   A, B and C with malloc, calls compute, then frees the three buffers.
#   rbx keeps the allocation size alive across the malloc calls.
#   Neither the argument count nor the malloc results are checked.
# ---------------------------------------------------------------------
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB15:
.cfi_startproc
# Save rbx, which holds the buffer size across the calls below.
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
# strtol(argv[1], NULL, 10): edx = base 10, rdi = argv[1], esi = NULL
movl $10, %edx
movq 8(%rsi), %rdi
xorl %esi, %esi
call strtol
# The low 32 bits of the result become m, n and k.
movl %eax, m(%rip)
movl %eax, n(%rip)
movl %eax, k(%rip)
# eax = m * m as a 32-bit multiply, then sign-extended and scaled by 8
# (the size of a double): rbx = bytes per matrix.
imull %eax, %eax
movslq %eax, %rbx
salq $3, %rbx
# A = malloc(rbx)
movq %rbx, %rdi
call malloc
movq %rbx, %rdi
movq %rax, A(%rip)
# B = malloc(rbx)
call malloc
movq %rbx, %rdi
movq %rax, B(%rip)
# C = malloc(rbx)
call malloc
movq %rax, C(%rip)
# eax = 0 before the call (presumably the %al vector-register count the
# ABI requires when calling a function declared without a prototype).
xorl %eax, %eax
call compute
# Release the buffers in allocation order.
movq A(%rip), %rdi
call free
movq B(%rip), %rdi
call free
movq C(%rip), %rdi
call free
# eax is not set after the last call, so the value main returns is
# whatever free left in it.
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE15:
.size main, .-main
# Storage for the program's globals, declared as common symbols
# (.comm name, size in bytes, alignment): three 8-byte matrix pointers
# and three 4-byte dimensions.
.comm C,8,8
.comm B,8,8
.comm A,8,8
.comm k,4,4
.comm n,4,4
.comm m,4,4
# Compiler identification string.
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1"
# Empty marker section: tells the linker this object does not need an
# executable stack.
.section .note.GNU-stack,"",@progbits
next reply other threads:[~2014-03-06 0:56 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-06 0:55 Alen Stojanov [this message]
2014-03-06 1:40 ` Some troubles with perf and measuring flops Vince Weaver
2014-03-06 1:53 ` Alen Stojanov
2014-03-06 18:25 ` Vince Weaver
2014-03-06 19:41 ` Alen Stojanov
2014-03-11 23:53 ` Alen Stojanov
2014-03-13 20:17 ` Vince Weaver
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5317C76A.4050103@inf.ethz.ch \
--to=astojanov@inf.ethz.ch \
--cc=linux-perf-users@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.