From: Alen Stojanov <astojanov@inf.ethz.ch>
To: linux-perf-users@vger.kernel.org
Subject: Some troubles with perf and measuring flops
Date: Thu, 6 Mar 2014 01:55:06 +0100
Message-ID: <5317C76A.4050103@inf.ethz.ch>
[-- Attachment #1: Type: text/plain, Size: 1677 bytes --]
Dear Linux Perf Users Community,
I have noticed some inconsistencies with the perf tool, and I would like
to determine whether I am doing something wrong or whether there is a
problem in the perf tool itself. Here is the issue:
I would like to obtain the flop count of a simple matrix-matrix
multiplication. The code is available in the attachment as mmmtest.c. To
obtain the count, I run the perf tool using a raw counter. For matrices
with sizes below 150x150, I obtain accurate results. Example (anticipated
flops: 100 * 100 * 100 * 2 = 2'000'000):
perf stat -e r538010 ./mmmtest 100

 Performance counter stats for './mmmtest 100':

         2,078,775 r538010

       0.003889544 seconds time elapsed
However, for matrices of bigger size, the reported count is not even
close to the flops I expect (anticipated: 600 * 600 * 600 * 2 =
432'000'000):
perf stat -e r538010 ./mmmtest 600

 Performance counter stats for './mmmtest 600':

     2,348,148,851 r538010

       0.955511968 seconds time elapsed
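To rule out counts coming from program startup and teardown, the event
can also be read around just the compute() call using perf_event_open(2).
Below is a minimal, self-contained sketch of such a harness (assumptions:
the same r538010 encoding as above and a kernel with perf_event support;
unlike the attached mmmtest.c, it zero-initializes C with calloc):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int m, n, k;
double *A, *B, *C;

void compute() {
    int i, j, h;
    for (i = 0; i < m; ++i)
        for (j = 0; j < n; ++j)
            for (h = 0; h < k; ++h)
                C[i*n+j] += A[i*k+h] * B[h*n+j];
}

int main(int argc, char **argv)
{
    struct perf_event_attr attr;
    long long count = 0;
    int fd;

    if (argc < 2) return 1;
    m = atoi(argv[1]); n = m; k = m;
    A = malloc(m * k * sizeof(double));
    B = malloc(k * n * sizeof(double));
    C = calloc(m * n, sizeof(double));  /* zeroed, so += is well-defined */

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_RAW;
    attr.config = 0x538010;        /* FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE */
    attr.disabled = 1;             /* enabled explicitly around compute() */
    attr.exclude_kernel = 1;       /* user-space counts only */

    /* measure this thread, on any CPU */
    fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) { perror("perf_event_open"); return 1; }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    compute();
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    read(fd, &count, sizeof(count));

    printf("compute() only: %lld (anticipated %lld)\n",
           count, 2LL * m * n * k);
    close(fd);
    free(A); free(B); free(C);
    return 0;
}

If the count read this way still deviates from 2 * n^3, the discrepancy
comes from the event itself rather than from measurement scope.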
To help replicate the problem, here is my setup:
CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores
Linux Kernel: 3.11.0-12-generic
GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)
Monitored event: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - raw event 0x538010,
converted using libpfm4 (roughly as sketched below)
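The conversion was done along these lines (a sketch of one way to do it
with libpfm4's generic encoding API, pfm_get_event_encoding; link with
-lpfm):

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <perfmon/pfmlib.h>

int main(void)
{
    uint64_t *codes = NULL;   /* allocated by libpfm4 */
    int count = 0, ret;

    if (pfm_initialize() != PFM_SUCCESS)
        return 1;

    /* PFM_PLM0|PFM_PLM3: count kernel and user, like perf stat's default */
    ret = pfm_get_event_encoding("FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE",
                                 PFM_PLM0 | PFM_PLM3, NULL, NULL,
                                 &codes, &count);
    if (ret != PFM_SUCCESS) {
        fprintf(stderr, "cannot encode event: %s\n", pfm_strerror(ret));
        return 1;
    }
    printf("raw event: r%" PRIx64 "\n", codes[0]);   /* r538010 here */
    free(codes);
    return 0;
}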
I compiled mmmtest.c using: gcc -O3 -march=corei7-avx -o mmmtest
mmmtest.c. The generated assembly is attached as mmmtest.s.
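As a quick cross-check that the hot loop uses only scalar double-precision
instructions (so that 2 * n^3 is the right expectation), the multiply and
add instructions can be counted in the attached assembly:

grep -cE 'v(mul|add)sd' mmmtest.s

This reports 2: one vmulsd and one vaddsd in the innermost loop, each
executed m * n * k times.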
Do you know why this happens? How can I instruct perf to obtain accurate
results?
Greetings,
Alen
[-- Attachment #2: mmmtest.c --]
[-- Type: text/plain, Size: 479 bytes --]
#include <stdlib.h>

int m, n, k;
double *A, *B, *C;

/* C += A * B, classic triple loop: 2 * m * n * k flops */
void compute() {
    int i, j, h;
    for (i = 0; i < m; ++i) {
        for (j = 0; j < n; ++j) {
            for (h = 0; h < k; ++h) {
                C[i*n+j] += A[i*k+h] * B[h*n+j];
            }
        }
    }
}

int main(int argc, char **argv)
{
    m = atoi(argv[1]); n = m; k = m;
    A = (double *) malloc (m * k * sizeof(double));
    B = (double *) malloc (k * n * sizeof(double));
    /* note: none of the buffers are zero-initialized, so compute()
       accumulates into garbage values; the flop count is unaffected */
    C = (double *) malloc (m * n * sizeof(double));
    compute ();
    free(A);
    free(B);
    free(C);
}
[-- Attachment #3: mmmtest.s --]
[-- Type: text/plain, Size: 2423 bytes --]
        .file "mmmtest.c"
        .text
        .p2align 4,,15
        .globl compute
        .type compute, @function
compute:
.LFB14:
        .cfi_startproc
        pushq %r15
        .cfi_def_cfa_offset 16
        .cfi_offset 15, -16
        pushq %r14
        .cfi_def_cfa_offset 24
        .cfi_offset 14, -24
        pushq %r13
        .cfi_def_cfa_offset 32
        .cfi_offset 13, -32
        pushq %r12
        .cfi_def_cfa_offset 40
        .cfi_offset 12, -40
        movl m(%rip), %r12d
        pushq %rbp
        .cfi_def_cfa_offset 48
        .cfi_offset 6, -48
        pushq %rbx
        .cfi_def_cfa_offset 56
        .cfi_offset 3, -56
        testl %r12d, %r12d
        jle .L9
        movl n(%rip), %ebp
        xorl %ebx, %ebx
        movl k(%rip), %esi
        movq B(%rip), %r15
        movq A(%rip), %rdi
        movq C(%rip), %r11
        leal -1(%rbp), %eax
        movslq %ebp, %r8
        leaq 8(,%rax,8), %r13
        movslq %esi, %r14
        salq $3, %r8
        salq $3, %r14
.L3:
        testl %ebp, %ebp
        jle .L5
        leaq 0(%r13,%r11), %r10
        movq %r15, %r9
        movq %r11, %rcx
        .p2align 4,,10
        .p2align 3
.L8:
        testl %esi, %esi
        jle .L6
        vmovsd (%rcx), %xmm0
        movq %r9, %rdx
        xorl %eax, %eax
        .p2align 4,,10
        .p2align 3
.L7:
        vmovsd (%rdi,%rax,8), %xmm1
        addq $1, %rax
        vmulsd (%rdx), %xmm1, %xmm1
        addq %r8, %rdx
        cmpl %eax, %esi
        vaddsd %xmm1, %xmm0, %xmm0
        vmovsd %xmm0, (%rcx)
        jg .L7
.L6:
        addq $8, %rcx
        addq $8, %r9
        cmpq %r10, %rcx
        jne .L8
.L5:
        addl $1, %ebx
        addq %r14, %rdi
        addq %r8, %r11
        cmpl %r12d, %ebx
        jne .L3
.L9:
        popq %rbx
        .cfi_def_cfa_offset 48
        popq %rbp
        .cfi_def_cfa_offset 40
        popq %r12
        .cfi_def_cfa_offset 32
        popq %r13
        .cfi_def_cfa_offset 24
        popq %r14
        .cfi_def_cfa_offset 16
        popq %r15
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE14:
        .size compute, .-compute
        .section .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl main
        .type main, @function
main:
.LFB15:
        .cfi_startproc
        pushq %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl $10, %edx
        movq 8(%rsi), %rdi
        xorl %esi, %esi
        call strtol
        movl %eax, m(%rip)
        movl %eax, n(%rip)
        movl %eax, k(%rip)
        imull %eax, %eax
        movslq %eax, %rbx
        salq $3, %rbx
        movq %rbx, %rdi
        call malloc
        movq %rbx, %rdi
        movq %rax, A(%rip)
        call malloc
        movq %rbx, %rdi
        movq %rax, B(%rip)
        call malloc
        movq %rax, C(%rip)
        xorl %eax, %eax
        call compute
        movq A(%rip), %rdi
        call free
        movq B(%rip), %rdi
        call free
        movq C(%rip), %rdi
        call free
        popq %rbx
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE15:
        .size main, .-main
        .comm C,8,8
        .comm B,8,8
        .comm A,8,8
        .comm k,4,4
        .comm n,4,4
        .comm m,4,4
        .ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1"
        .section .note.GNU-stack,"",@progbits