All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alen Stojanov <astojanov@inf.ethz.ch>
To: linux-perf-users@vger.kernel.org
Subject: Some troubles with perf and measuring flops
Date: Thu, 6 Mar 2014 01:55:06 +0100	[thread overview]
Message-ID: <5317C76A.4050103@inf.ethz.ch> (raw)

[-- Attachment #1: Type: text/plain, Size: 1677 bytes --]

Dear Linux Perf Users Community,

I noticed some inconsistencies with the perf tool. I would like to 
determine whether I am doing something wrong, or whether there is a 
problem in the perf tool. Here is the problem:

I would like to obtain flops on a simple matrix-to-matrix multiplication 
algorithm. The code is available in the attachment as mmmtest.c. To 
obtain flops, I run the perf tool using raw counters. When I try to 
obtain flops for matrices having sizes below 150x150, I obtain accurate 
results. Example (anticipated flops: 100 * 100 * 100 * 2 = 2'000'000):

perf stat -e r538010 ./mmmtest 100

  Performance counter stats for './mmmtest 100':

          2,078,775 r538010

        0.003889544 seconds time elapsed


However, whenever I try to run matrices of bigger size, the reported 
flops are not even close to the flops that I am supposed to obtain 
(anticipated results: 600 * 600 * 600 * 2 = 432'000'000):

perf stat -e r538010 ./mmmtest 600

  Performance counter stats for './mmmtest 600':

      2,348,148,851 r538010

        0.955511968 seconds time elapsed


To give you more info to replicate the problem, I provide you with the 
following:

CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores
Linux Kernel: 3.11.0-12-generic
GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)
Monitored events: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - Raw event: 
0x538010 (converted using libpfm4)

I have compiled the mmmtest.c using gcc -O3 -march=corei7-avx -o mmmtest 
mmmtest.c. You can also find mmmtest.s asm version in the attachment.

Do you know why this happens? How can I instruct perf to obtain 
accurate results?

Greetings,
Alen

[-- Attachment #2: mmmtest.c --]
[-- Type: text/plain, Size: 479 bytes --]

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Matrix dimensions read by compute(). main() assigns the same value to
 * all three, so the matrices it sets up are square. */
int m, n, k;
/* Matrix buffers, indexed by compute() as A[i*k+h], B[h*n+j] and C[i*n+j]
 * (row-major; A is m x k, B is k x n, C is m x n). */
double *A, *B, *C;

/*
 * Accumulate the matrix product A * B into C, using the file-level
 * globals for both the dimensions (m, n, k) and the buffers (A, B, C).
 *
 * Every inner-loop step performs one multiplication and one addition and
 * updates C in place; whatever C already holds is added to, not
 * overwritten.
 */
void compute() {
	int row = 0;

	while (row < m) {
		int col;

		for (col = 0; col < n; col++) {
			int t;

			/* Dot product of row `row` of A with column `col` of B. */
			for (t = 0; t < k; t++) {
				C[row*n + col] += A[row*k + t] * B[t*n + col];
			}
		}
		row++;
	}
}

/*
 * Usage: mmmtest N
 *
 * Sets m = n = k = N, allocates three zero-filled N x N matrices of
 * doubles, runs compute() once and releases the buffers.
 *
 * Returns EXIT_SUCCESS after a completed run, and EXIT_FAILURE when the
 * argument is missing or is not a usable size, or when an allocation
 * fails.
 */
int main(int argc, char **argv)
{
	char *end;
	long dim;
	size_t count;

	/* Without this guard argv[1] is NULL and the parse below would
	 * dereference it. */
	if (argc < 2) {
		fprintf(stderr, "usage: %s N\n", argc > 0 ? argv[0] : "mmmtest");
		return EXIT_FAILURE;
	}

	/* strtol instead of atoi: it reports garbage and out-of-range input.
	 * compute() indexes with int expressions up to N*N - 1, so N*N must
	 * fit in an int as well. */
	errno = 0;
	dim = strtol(argv[1], &end, 10);
	if (errno != 0 || end == argv[1] || *end != '\0' || dim < 0 ||
	    (dim > 0 && dim > INT_MAX / dim)) {
		fprintf(stderr, "mmmtest: invalid matrix size '%s'\n", argv[1]);
		return EXIT_FAILURE;
	}

	m = (int) dim; n = m; k = m;

	/* N*N <= INT_MAX was checked above, so this product cannot wrap. */
	count = (size_t) m * (size_t) m;

	/* calloc rather than malloc: compute() reads A and B and does
	 * C[...] += ..., so all three buffers need defined contents.
	 * All-bits-zero is 0.0 for IEEE-754 doubles. */
	A = (double *) calloc (count, sizeof(double));
	B = (double *) calloc (count, sizeof(double));
	C = (double *) calloc (count, sizeof(double));

	/* A zero-element request may legitimately yield NULL; compute() does
	 * not touch the buffers in that case. */
	if (count > 0 && (A == NULL || B == NULL || C == NULL)) {
		fprintf(stderr, "mmmtest: cannot allocate three %dx%d matrices\n", m, m);
		free(A);
		free(B);
		free(C);
		return EXIT_FAILURE;
	}

	compute ();

	free(A);
	free(B);
	free(C);

	return EXIT_SUCCESS;
}

[-- Attachment #3: mmmtest.s --]
[-- Type: text/plain, Size: 2423 bytes --]

# Compiler output for mmmtest.c (the .ident line at the end of this file
# names the producing GCC).  x86-64, AT&T operand order: source first,
# destination last.
	.file	"mmmtest.c"
	.text
# ----------------------------------------------------------------------
# void compute()
#
# Triple loop over the globals m, n, k, A, B, C that performs
#     C[i*n + j] += A[i*k + h] * B[h*n + j]
# for every i < m, j < n, h < k.
#
# In:     nothing in registers; m, n, k, A, B, C are loaded RIP-relative.
# Out:    nothing; results are stored through the pointer loaded from C.
# Saved:  r15, r14, r13, r12, rbp, rbx are pushed on entry and popped on
#         exit.  The .cfi_* directives after each push/pop track the CFA
#         offset and the save slot of each register.
#
# Register roles after the setup that precedes .L3:
#     r12d = m                  ebx = i (outer counter)
#     ebp  = n                  esi = k
#     r15  = B                  r8  = n * 8, byte step between rows of B and C
#     rdi  = &A[i*k]            r14 = k * 8, byte step between rows of A
#     r11  = &C[i*n]            r13 = 8 + 8*(n-1), byte length of one C row
#     r10  = r11 + r13          (one past the end of the current C row)
#     rcx  = &C[i*n + j]        r9  = &B[j]
#     rdx  = &B[h*n + j]        eax = h (inner counter)
#     xmm0 = running value of C[i*n + j];  xmm1 = current product
# ----------------------------------------------------------------------
	.p2align 4,,15
	.globl	compute
	.type	compute, @function
compute:
.LFB14:
	.cfi_startproc
	pushq	%r15
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	# The load of m is interleaved with the remaining register saves.
	movl	m(%rip), %r12d
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	# m <= 0: no rows to process, go straight to the epilogue.
	testl	%r12d, %r12d
	jle	.L9
	# One-time setup: load the remaining globals and precompute the byte
	# strides so the loops below only add to pointers.
	movl	n(%rip), %ebp
	xorl	%ebx, %ebx
	movl	k(%rip), %esi
	movq	B(%rip), %r15
	movq	A(%rip), %rdi
	movq	C(%rip), %r11
	# eax = n - 1; r13 = 8 + 8*eax.  r13 is only consumed behind the
	# n > 0 test at .L3, where it equals n * 8.
	leal	-1(%rbp), %eax
	movslq	%ebp, %r8
	leaq	8(,%rax,8), %r13
	movslq	%esi, %r14
	salq	$3, %r8
	salq	$3, %r14
# Outer loop: one iteration per row i.
.L3:
	# n <= 0: this row has no columns.
	testl	%ebp, %ebp
	jle	.L5
	# r10 = end of C row i, r9 = &B[0] (column 0), rcx = &C[i*n].
	leaq	0(%r13,%r11), %r10
	movq	%r15, %r9
	movq	%r11, %rcx
	.p2align 4,,10
	.p2align 3
# Middle loop: one iteration per column j.
.L8:
	# k <= 0: nothing to accumulate for this element.
	testl	%esi, %esi
	jle	.L6
	# xmm0 = C[i*n + j]; rdx walks down column j of B; h = 0.
	vmovsd	(%rcx), %xmm0
	movq	%r9, %rdx
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
# Inner loop: one vmulsd and one vaddsd (scalar double) per value of h.
.L7:
	# xmm1 = A[i*k + h] * B[h*n + j]; then h++ and rdx moves one B row down.
	vmovsd	(%rdi,%rax,8), %xmm1
	addq	$1, %rax
	vmulsd	(%rdx), %xmm1, %xmm1
	addq	%r8, %rdx
	# The compare is issued early; the jg below consumes its flags, with
	# only the vector add and store in between.
	cmpl	%eax, %esi
	vaddsd	%xmm1, %xmm0, %xmm0
	# The running sum is written back to C[i*n + j] on every iteration
	# (presumably because the store may alias A or B -- not verifiable
	# from this listing alone).
	vmovsd	%xmm0, (%rcx)
	# Continue while k > h (signed).
	jg	.L7
.L6:
	# Next column: advance the C element pointer and the B column base
	# by one double each; stop at the end of the C row.
	addq	$8, %rcx
	addq	$8, %r9
	cmpq	%r10, %rcx
	jne	.L8
.L5:
	# Next row: i++, step the A row pointer by k*8 and the C row pointer
	# by n*8; stop when i == m.
	addl	$1, %ebx
	addq	%r14, %rdi
	addq	%r8, %r11
	cmpl	%r12d, %ebx
	jne	.L3
# Epilogue: restore the saved registers in reverse push order.
.L9:
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE14:
	.size	compute, .-compute
	.section	.text.startup,"ax",@progbits
	.p2align 4,,15
	.globl	main
	.type	main, @function
main:
.LFB15:
	.cfi_startproc
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	movl	$10, %edx
	movq	8(%rsi), %rdi
	xorl	%esi, %esi
	call	strtol
	movl	%eax, m(%rip)
	movl	%eax, n(%rip)
	movl	%eax, k(%rip)
	imull	%eax, %eax
	movslq	%eax, %rbx
	salq	$3, %rbx
	movq	%rbx, %rdi
	call	malloc
	movq	%rbx, %rdi
	movq	%rax, A(%rip)
	call	malloc
	movq	%rbx, %rdi
	movq	%rax, B(%rip)
	call	malloc
	movq	%rax, C(%rip)
	xorl	%eax, %eax
	call	compute
	movq	A(%rip), %rdi
	call	free
	movq	B(%rip), %rdi
	call	free
	movq	C(%rip), %rdi
	call	free
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE15:
	.size	main, .-main
# Common (uninitialised) symbols for the globals of mmmtest.c, written as
# .comm name,size,alignment: three 8-byte pointers and three 4-byte ints.
	.comm	C,8,8
	.comm	B,8,8
	.comm	A,8,8
	.comm	k,4,4
	.comm	n,4,4
	.comm	m,4,4
# Compiler identification string.
	.ident	"GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1"
# Empty .note.GNU-stack section (no "x" flag); by GNU toolchain convention
# this marks the object as not needing an executable stack.
	.section	.note.GNU-stack,"",@progbits

             reply	other threads:[~2014-03-06  0:56 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-06  0:55 Alen Stojanov [this message]
2014-03-06  1:40 ` Some troubles with perf and measuring flops Vince Weaver
2014-03-06  1:53   ` Alen Stojanov
2014-03-06 18:25     ` Vince Weaver
2014-03-06 19:41       ` Alen Stojanov
2014-03-11 23:53         ` Alen Stojanov
2014-03-13 20:17           ` Vince Weaver

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5317C76A.4050103@inf.ethz.ch \
    --to=astojanov@inf.ethz.ch \
    --cc=linux-perf-users@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.