From mboxrd@z Thu Jan 1 00:00:00 1970 From: Alen Stojanov Subject: Some troubles with perf and measuring flops Date: Thu, 6 Mar 2014 01:55:06 +0100 Message-ID: <5317C76A.4050103@inf.ethz.ch> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------030603030201080508060603" Return-path: Received: from edge20.ethz.ch ([82.130.99.26]:4009 "EHLO edge20.ethz.ch" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757246AbaCFA4S (ORCPT ); Wed, 5 Mar 2014 19:56:18 -0500 Sender: linux-perf-users-owner@vger.kernel.org List-ID: To: linux-perf-users@vger.kernel.org --------------030603030201080508060603 Content-Type: text/plain; charset="ISO-8859-1"; format=flowed Content-Transfer-Encoding: 7bit Dear Linux Perf Users Community, I noticed some inconsistencies with the perf tool. I would like to determine whether I am doing something wrong, or whether there is a problem in the perf tool. Here is the problem: I would like to obtain flops on a simple matrix-to-matrix multiplication algorithm. The code is available in the attachment as mmmtest.c. To obtain flops, I run the perf tool using raw counters. When I try to obtain flops for matrices having sizes below 150x150, I obtain accurate results. 
Example (anticipated flops: 100 * 100 * 100 * 2 = 2'000'000): perf stat -e r538010 ./mmmtest 100 Performance counter stats for './mmmtest 100': 2,078,775 r538010 0.003889544 seconds time elapsed However, whenever I try to run matrices of bigger size, the reported flops are not even close to the flops that I am supposed to obtain (anticipated results: 600 * 600 * 600 * 2 = 432'000'000): perf stat -e r538010 ./mmmtest 600 Performance counter stats for './mmmtest 600': 2,348,148,851 r538010 0.955511968 seconds time elapsed To give you more info to replicate the problem, I provide you with the following: CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores Linux Kernel: 3.11.0-12-generic GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8) Monitored events: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - Raw event: 0x538010 (converted using libpfm4) I have compiled the mmmtest.c using gcc -O3 -march=corei7-avx -o mmmtest mmmtest.c. You can also find the mmmtest.s asm version in the attachment. Do you know why this happens? How can I instruct perf to obtain accurate results? 
Greetings, Alen --------------030603030201080508060603 Content-Type: text/plain; charset="UTF-8"; x-mac-type=54455854; x-mac-creator=21526368; name="mmmtest.c" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="mmmtest.c" #include int m, n, k; double *A, *B, *C; void compute() { int i,j,h; for(i = 0; i < m; ++i) { for(j = 0; j < n; ++j) { for(h = 0; h < k; ++h) { C[i*n+j] += A[i*k+h] * B[h*n+j]; } } } } int main(int argc, char **argv) { m = atoi(argv[1]); n = m; k = m; A = (double *) malloc (m * k * sizeof(double)); B = (double *) malloc (k * n * sizeof(double)); C = (double *) malloc (m * n * sizeof(double)); compute (); free(A); free(B); free(C); } --------------030603030201080508060603 Content-Type: text/plain; charset="UTF-8"; x-mac-type=54455854; x-mac-creator=21526368; name="mmmtest.s" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="mmmtest.s" .file "mmmtest.c" .text .p2align 4,,15 .globl compute .type compute, @function compute: .LFB14: .cfi_startproc pushq %r15 .cfi_def_cfa_offset 16 .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 .cfi_offset 13, -32 pushq %r12 .cfi_def_cfa_offset 40 .cfi_offset 12, -40 movl m(%rip), %r12d pushq %rbp .cfi_def_cfa_offset 48 .cfi_offset 6, -48 pushq %rbx .cfi_def_cfa_offset 56 .cfi_offset 3, -56 testl %r12d, %r12d jle .L9 movl n(%rip), %ebp xorl %ebx, %ebx movl k(%rip), %esi movq B(%rip), %r15 movq A(%rip), %rdi movq C(%rip), %r11 leal -1(%rbp), %eax movslq %ebp, %r8 leaq 8(,%rax,8), %r13 movslq %esi, %r14 salq $3, %r8 salq $3, %r14 .L3: testl %ebp, %ebp jle .L5 leaq 0(%r13,%r11), %r10 movq %r15, %r9 movq %r11, %rcx .p2align 4,,10 .p2align 3 .L8: testl %esi, %esi jle .L6 vmovsd (%rcx), %xmm0 movq %r9, %rdx xorl %eax, %eax .p2align 4,,10 .p2align 3 .L7: vmovsd (%rdi,%rax,8), %xmm1 addq $1, %rax vmulsd (%rdx), %xmm1, %xmm1 addq %r8, %rdx cmpl %eax, %esi vaddsd %xmm1, %xmm0, %xmm0 vmovsd %xmm0, (%rcx) jg .L7 .L6: addq 
$8, %rcx addq $8, %r9 cmpq %r10, %rcx jne .L8 .L5: addl $1, %ebx addq %r14, %rdi addq %r8, %r11 cmpl %r12d, %ebx jne .L3 .L9: popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 .cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE14: .size compute, .-compute .section .text.startup,"ax",@progbits .p2align 4,,15 .globl main .type main, @function main: .LFB15: .cfi_startproc pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 movl $10, %edx movq 8(%rsi), %rdi xorl %esi, %esi call strtol movl %eax, m(%rip) movl %eax, n(%rip) movl %eax, k(%rip) imull %eax, %eax movslq %eax, %rbx salq $3, %rbx movq %rbx, %rdi call malloc movq %rbx, %rdi movq %rax, A(%rip) call malloc movq %rbx, %rdi movq %rax, B(%rip) call malloc movq %rax, C(%rip) xorl %eax, %eax call compute movq A(%rip), %rdi call free movq B(%rip), %rdi call free movq C(%rip), %rdi call free popq %rbx .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE15: .size main, .-main .comm C,8,8 .comm B,8,8 .comm A,8,8 .comm k,4,4 .comm n,4,4 .comm m,4,4 .ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1" .section .note.GNU-stack,"",@progbits --------------030603030201080508060603--