public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
@ 2009-06-29  9:33 Jaswinder Singh Rajput
  2009-06-30 10:11 ` Ingo Molnar
  0 siblings, 1 reply; 9+ messages in thread
From: Jaswinder Singh Rajput @ 2009-06-29  9:33 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, x86 maintainers,
	LKML


 $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null

 Performance counter stats for 'ls -lR /usr/include/':

           7335  add                       (   2.00x scaled)
           8012  multiply                  (   1.99x scaled)
           5229  fpu-store                 (   2.00x scaled)
      793097355  fpu-empty                 (   2.00x scaled)
            182  fpu-busy                  (   2.00x scaled)
              6  x87                       (   2.01x scaled)
              4  mmx-3dnow                 (   2.00x scaled)
           8933  sse-sse2                  (   2.00x scaled)

    0.393548820  seconds time elapsed

 $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3

 Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':

       19583739  add                       (   2.01x scaled)
       20856051  multiply                  (   2.01x scaled)
       18669503  fpu-store                 (   2.00x scaled)
    25100224054  fpu-empty                 (   1.99x scaled)
       12540131  fpu-busy                  (   1.99x scaled)
         207228  x87                       (   1.99x scaled)
        1768418  mmx-3dnow                 (   2.00x scaled)
       42286702  sse-sse2                  (   2.01x scaled)

  302.698647617  seconds time elapsed

 $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv

 Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':

     6572682335  add                       (   2.00x scaled)
    11131555181  multiply                  (   2.00x scaled)
     1317520699  fpu-store                 (   2.00x scaled)
     9089415134  fpu-empty                 (   1.99x scaled)
     2902772713  fpu-busy                  (   2.00x scaled)
          26047  x87                       (   2.00x scaled)
    24850978532  mmx-3dnow                 (   2.00x scaled)
      262276117  sse-sse2                  (   2.01x scaled)

   96.169312358  seconds time elapsed

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/cpu/perf_counter.c |   34 ++++++++++++++++++++++++++++++
 include/linux/perf_counter.h       |   17 +++++++++++++++
 kernel/perf_counter.c              |    1 +
 tools/perf/util/parse-events.c     |   40 ++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b83474b..4417edf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
  },
 };
 
+/*
+ * Generalized hw fpu event table
+ */
+
+static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
+
 static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
@@ -481,6 +487,18 @@ static const u64 amd_hw_cache_event_ids
  },
 };
 
+static const u64 amd_hw_fpu_event_ids[] =
+{
+  [PERF_COUNT_HW_FPU_ADD]		= 0x0100, /* Dispatched FPU Add	     */
+  [PERF_COUNT_HW_FPU_MULTIPLY]		= 0x0200, /* Dispatched FPU Multiply */
+  [PERF_COUNT_HW_FPU_STORE]		= 0x0400, /* Dispatched FPU Store    */
+  [PERF_COUNT_HW_FPU_EMPTY]		= 0x0001, /* FPU Empty cycles        */
+  [PERF_COUNT_HW_FPU_BUSY]		= 0x00D7, /* Dispatch stall for FPU  */
+  [PERF_COUNT_HW_FPU_X87_INSTR]		= 0x01CB, /* Retired x87 Instructions*/
+  [PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR]	= 0x02CB, /* Retired MMX & 3DNow Inst*/
+  [PERF_COUNT_HW_FPU_SSE_SSE2_INSTR]	= 0x0CCB, /* Retired SSE & SSE2 Instr*/
+};
+
 /*
  * AMD Performance Monitor K7 and later.
  */
@@ -659,6 +677,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
 	return 0;
 }
 
+static inline int
+set_hw_fpu_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+	if (attr->config >= PERF_COUNT_HW_FPU_MAX)
+		return -EINVAL;
+
+	hwc->config |= hw_fpu_event_ids[attr->config];
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -716,6 +745,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (attr->type == PERF_TYPE_HW_CACHE)
 		return set_ext_hw_attr(hwc, attr);
 
+	if (attr->type == PERF_TYPE_HW_FPU)
+		return set_hw_fpu_attr(hwc, attr);
+
 	if (attr->config >= x86_pmu.max_events)
 		return -EINVAL;
 	/*
@@ -1468,6 +1500,8 @@ static int amd_pmu_init(void)
 	/* Events are common for all AMDs */
 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
 	       sizeof(hw_cache_event_ids));
+	memcpy(hw_fpu_event_ids, amd_hw_fpu_event_ids,
+	       sizeof(hw_fpu_event_ids));
 
 	return 0;
 }
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3078e23..89b3370 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
 	PERF_TYPE_TRACEPOINT			= 2,
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
+	PERF_TYPE_HW_FPU			= 5,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
@@ -89,6 +90,22 @@ enum perf_hw_cache_op_result_id {
 };
 
 /*
+ * Generalized hardware FPU counters:
+ */
+enum perf_hw_fpu_id {
+	PERF_COUNT_HW_FPU_ADD			= 0,
+	PERF_COUNT_HW_FPU_MULTIPLY		= 1,
+	PERF_COUNT_HW_FPU_STORE			= 2,
+	PERF_COUNT_HW_FPU_EMPTY			= 3,
+	PERF_COUNT_HW_FPU_BUSY			= 4,
+	PERF_COUNT_HW_FPU_X87_INSTR		= 5,
+	PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR	= 6,
+	PERF_COUNT_HW_FPU_SSE_SSE2_INSTR	= 7,
+
+	PERF_COUNT_HW_FPU_MAX,			/* non-ABI */
+};
+
+/*
  * Special "software" counters provided by the kernel, even if the hardware
  * does not support performance counters. These counters measure various
  * physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 66ab1e9..c40132f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3788,6 +3788,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	case PERF_TYPE_RAW:
 	case PERF_TYPE_HARDWARE:
 	case PERF_TYPE_HW_CACHE:
+	case PERF_TYPE_HW_FPU:
 		pmu = hw_perf_counter_init(counter);
 		break;
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4d042f1..4d03061 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,19 @@ static struct event_symbol event_symbols[] = {
   { CSW(CPU_MIGRATIONS),	"cpu-migrations",	"migrations"	},
 };
 
+#define CHFPU(x) .type = PERF_TYPE_HW_FPU, .config = PERF_COUNT_HW_FPU_##x
+
+static struct event_symbol fpu_event_symbols[] = {
+  { CHFPU(ADD),			"add",			"addition"	},
+  { CHFPU(MULTIPLY),		"multiply",		"multiplication"},
+  { CHFPU(STORE),		"fpu-store",		""		},
+  { CHFPU(EMPTY),		"fpu-empty",		""		},
+  { CHFPU(BUSY),		"fpu-busy",		""		},
+  { CHFPU(X87_INSTR),		"x87",			""		},
+  { CHFPU(MMX_3DNOW_INSTR),	"mmx-3dnow",		""		},
+  { CHFPU(SSE_SSE2_INSTR),	"sse-sse2",		"sse"		},
+};
+
 #define __PERF_COUNTER_FIELD(config, name) \
 	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
 
@@ -172,6 +185,11 @@ char *event_name(int counter)
 		return event_cache_name(cache_type, cache_op, cache_result);
 	}
 
+	case PERF_TYPE_HW_FPU:
+		if (config < PERF_COUNT_HW_FPU_MAX)
+			return fpu_event_symbols[config].symbol;
+		return "unknown-fpu";
+
 	case PERF_TYPE_SOFTWARE:
 		if (config < PERF_COUNT_SW_MAX)
 			return sw_event_names[config];
@@ -250,6 +268,19 @@ static int check_events(const char *str, unsigned int i)
 	return 0;
 }
 
+static int check_fpu_events(const char *str, unsigned int i)
+{
+	if (!strncmp(str, fpu_event_symbols[i].symbol,
+		     strlen(fpu_event_symbols[i].symbol)))
+		return 1;
+
+	if (strlen(fpu_event_symbols[i].alias))
+		if (!strncmp(str, fpu_event_symbols[i].alias,
+			     strlen(fpu_event_symbols[i].alias)))
+			return 1;
+	return 0;
+}
+
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
@@ -297,6 +328,15 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
 		}
 	}
 
+	for (i = 0; i < ARRAY_SIZE(fpu_event_symbols); i++) {
+		if (check_fpu_events(str, i)) {
+			attr->type = fpu_event_symbols[i].type;
+			attr->config = fpu_event_symbols[i].config;
+
+			return 0;
+		}
+	}
+
 	return parse_generic_hw_symbols(str, attr);
 }
 
-- 
1.6.0.6




^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-29  9:33 [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD Jaswinder Singh Rajput
@ 2009-06-30 10:11 ` Ingo Molnar
  2009-06-30 13:20   ` Jaswinder Singh Rajput
  0 siblings, 1 reply; 9+ messages in thread
From: Ingo Molnar @ 2009-06-30 10:11 UTC (permalink / raw)
  To: Jaswinder Singh Rajput
  Cc: Thomas Gleixner, Peter Zijlstra, x86 maintainers, LKML


* Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:

>  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> 
>  Performance counter stats for 'ls -lR /usr/include/':
> 
>            7335  add                       (   2.00x scaled)
>            8012  multiply                  (   1.99x scaled)
>            5229  fpu-store                 (   2.00x scaled)
>       793097355  fpu-empty                 (   2.00x scaled)
>             182  fpu-busy                  (   2.00x scaled)
>               6  x87                       (   2.01x scaled)
>               4  mmx-3dnow                 (   2.00x scaled)
>            8933  sse-sse2                  (   2.00x scaled)
> 
>     0.393548820  seconds time elapsed
> 
>  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> 
>  Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> 
>        19583739  add                       (   2.01x scaled)
>        20856051  multiply                  (   2.01x scaled)
>        18669503  fpu-store                 (   2.00x scaled)
>     25100224054  fpu-empty                 (   1.99x scaled)
>        12540131  fpu-busy                  (   1.99x scaled)
>          207228  x87                       (   1.99x scaled)
>         1768418  mmx-3dnow                 (   2.00x scaled)
>        42286702  sse-sse2                  (   2.01x scaled)
> 
>   302.698647617  seconds time elapsed
> 
>  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> 
>  Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> 
>      6572682335  add                       (   2.00x scaled)
>     11131555181  multiply                  (   2.00x scaled)
>      1317520699  fpu-store                 (   2.00x scaled)
>      9089415134  fpu-empty                 (   1.99x scaled)
>      2902772713  fpu-busy                  (   2.00x scaled)
>           26047  x87                       (   2.00x scaled)
>     24850978532  mmx-3dnow                 (   2.00x scaled)
>       262276117  sse-sse2                  (   2.01x scaled)
> 
>    96.169312358  seconds time elapsed
> 
> Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
> ---
>  arch/x86/kernel/cpu/perf_counter.c |   34 ++++++++++++++++++++++++++++++
>  include/linux/perf_counter.h       |   17 +++++++++++++++
>  kernel/perf_counter.c              |    1 +
>  tools/perf/util/parse-events.c     |   40 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 92 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> index b83474b..4417edf 100644
> --- a/arch/x86/kernel/cpu/perf_counter.c
> +++ b/arch/x86/kernel/cpu/perf_counter.c
> @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
>   },
>  };
>  
> +/*
> + * Generalized hw fpu event table
> + */
> +
> +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];

ok, this looks genuinely useful, but there are some gaps. Where are 
the divides? Plus things like mmx-3dnow are AMD specific, and sse-sse2 
is x86 specific. We definitely want this general table, but the 
events should be truly general.

Also, what would this look like on Intel, roughly?

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-30 10:11 ` Ingo Molnar
@ 2009-06-30 13:20   ` Jaswinder Singh Rajput
  2009-06-30 14:56     ` Jaswinder Singh Rajput
  2009-06-30 22:42     ` Ingo Molnar
  0 siblings, 2 replies; 9+ messages in thread
From: Jaswinder Singh Rajput @ 2009-06-30 13:20 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, Peter Zijlstra, x86 maintainers, LKML

On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> 
> >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> > 
> >  Performance counter stats for 'ls -lR /usr/include/':
> > 
> >            7335  add                       (   2.00x scaled)
> >            8012  multiply                  (   1.99x scaled)
> >            5229  fpu-store                 (   2.00x scaled)
> >       793097355  fpu-empty                 (   2.00x scaled)
> >             182  fpu-busy                  (   2.00x scaled)
> >               6  x87                       (   2.01x scaled)
> >               4  mmx-3dnow                 (   2.00x scaled)
> >            8933  sse-sse2                  (   2.00x scaled)
> > 
> >     0.393548820  seconds time elapsed
> > 
> >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> > 
> >  Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > 
> >        19583739  add                       (   2.01x scaled)
> >        20856051  multiply                  (   2.01x scaled)
> >        18669503  fpu-store                 (   2.00x scaled)
> >     25100224054  fpu-empty                 (   1.99x scaled)
> >        12540131  fpu-busy                  (   1.99x scaled)
> >          207228  x87                       (   1.99x scaled)
> >         1768418  mmx-3dnow                 (   2.00x scaled)
> >        42286702  sse-sse2                  (   2.01x scaled)
> > 
> >   302.698647617  seconds time elapsed
> > 
> >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> > 
> >  Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > 
> >      6572682335  add                       (   2.00x scaled)
> >     11131555181  multiply                  (   2.00x scaled)
> >      1317520699  fpu-store                 (   2.00x scaled)
> >      9089415134  fpu-empty                 (   1.99x scaled)
> >      2902772713  fpu-busy                  (   2.00x scaled)
> >           26047  x87                       (   2.00x scaled)
> >     24850978532  mmx-3dnow                 (   2.00x scaled)
> >       262276117  sse-sse2                  (   2.01x scaled)
> > 
> >    96.169312358  seconds time elapsed
> > 
> > Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
> > ---
> >  arch/x86/kernel/cpu/perf_counter.c |   34 ++++++++++++++++++++++++++++++
> >  include/linux/perf_counter.h       |   17 +++++++++++++++
> >  kernel/perf_counter.c              |    1 +
> >  tools/perf/util/parse-events.c     |   40 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 92 insertions(+), 0 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > index b83474b..4417edf 100644
> > --- a/arch/x86/kernel/cpu/perf_counter.c
> > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> >   },
> >  };
> >  
> > +/*
> > + * Generalized hw fpu event table
> > + */
> > +
> > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
> 
> ok, this looks genuinely useful, but there are some gaps. Where's 
> the divides? 

I was also surprised that divide is not available for AMD. That's why I did
not include it. You are right, it should be there.

> Plus things like mmx-3dnow are AMD specific, sse-sse2 
> is x86 specific. We definitely want this general table, but the 
> events should be truly general.
> 

mmx and sse are available on both Intel and AMD. That's why I added both
of them. Is that OK?

> Also, how would this look like on Intel, roughly?
> 

Intel has almost all of them + divide.

As you know, I work from home and I do not have any Intel machine which
supports the PMU.

Can you tell me which machine you have, so that I can prepare the FPU events
list for it and you can verify it on your side?

Thanks,
--
JSR



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-30 13:20   ` Jaswinder Singh Rajput
@ 2009-06-30 14:56     ` Jaswinder Singh Rajput
  2009-06-30 22:42     ` Ingo Molnar
  1 sibling, 0 replies; 9+ messages in thread
From: Jaswinder Singh Rajput @ 2009-06-30 14:56 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, Peter Zijlstra, x86 maintainers, LKML

On Tue, 2009-06-30 at 18:50 +0530, Jaswinder Singh Rajput wrote:
> On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:
> > 
> > >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> > > 
> > >  Performance counter stats for 'ls -lR /usr/include/':
> > > 
> > >            7335  add                       (   2.00x scaled)
> > >            8012  multiply                  (   1.99x scaled)
> > >            5229  fpu-store                 (   2.00x scaled)
> > >       793097355  fpu-empty                 (   2.00x scaled)
> > >             182  fpu-busy                  (   2.00x scaled)
> > >               6  x87                       (   2.01x scaled)
> > >               4  mmx-3dnow                 (   2.00x scaled)
> > >            8933  sse-sse2                  (   2.00x scaled)
> > > 
> > >     0.393548820  seconds time elapsed
> > > 
> > >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> > > 
> > >  Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > > 
> > >        19583739  add                       (   2.01x scaled)
> > >        20856051  multiply                  (   2.01x scaled)
> > >        18669503  fpu-store                 (   2.00x scaled)
> > >     25100224054  fpu-empty                 (   1.99x scaled)
> > >        12540131  fpu-busy                  (   1.99x scaled)
> > >          207228  x87                       (   1.99x scaled)
> > >         1768418  mmx-3dnow                 (   2.00x scaled)
> > >        42286702  sse-sse2                  (   2.01x scaled)
> > > 
> > >   302.698647617  seconds time elapsed
> > > 
> > >  $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> > > 
> > >  Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > > 
> > >      6572682335  add                       (   2.00x scaled)
> > >     11131555181  multiply                  (   2.00x scaled)
> > >      1317520699  fpu-store                 (   2.00x scaled)
> > >      9089415134  fpu-empty                 (   1.99x scaled)
> > >      2902772713  fpu-busy                  (   2.00x scaled)
> > >           26047  x87                       (   2.00x scaled)
> > >     24850978532  mmx-3dnow                 (   2.00x scaled)
> > >       262276117  sse-sse2                  (   2.01x scaled)
> > > 
> > >    96.169312358  seconds time elapsed
> > > 
> > > Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
> > > ---
> > >  arch/x86/kernel/cpu/perf_counter.c |   34 ++++++++++++++++++++++++++++++
> > >  include/linux/perf_counter.h       |   17 +++++++++++++++
> > >  kernel/perf_counter.c              |    1 +
> > >  tools/perf/util/parse-events.c     |   40 ++++++++++++++++++++++++++++++++++++
> > >  4 files changed, 92 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > > index b83474b..4417edf 100644
> > > --- a/arch/x86/kernel/cpu/perf_counter.c
> > > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> > >   },
> > >  };
> > >  
> > > +/*
> > > + * Generalized hw fpu event table
> > > + */
> > > +
> > > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
> > 
> > ok, this looks genuinely useful, but there are some gaps. Where's 
> > the divides? 
> 
> I was also surprised divide is not available for AMD. Thats why I did
> not included it. You are right it should be there.
> 

On AMD, the FPU operations include add, multiply and store.
Can I use store as divide for AMD? In the samples I showed above, they look
like divides.

> > Plus things like mmx-3dnow are AMD specific, sse-sse2 
> > is x86 specific. We definitely want this general table, but the 
> > events should be truly general.
> > 
> 
> mmx and sse are available for both Intel and AMD. Thats why I added both
> of them. Is it OK.
> 

Does this look OK:

 enum perf_hw_fpu_id {
       PERF_COUNT_HW_FPU_ADD                   = 0,
       PERF_COUNT_HW_FPU_MULTIPLY              = 1,
       PERF_COUNT_HW_FPU_DIVIDE                = 2,
       PERF_COUNT_HW_FPU_EMPTY                 = 3,
       PERF_COUNT_HW_FPU_STALL                 = 4,
       PERF_COUNT_HW_FPU_X87                   = 5,
       PERF_COUNT_HW_FPU_MMX                   = 6,
       PERF_COUNT_HW_FPU_SSE                   = 7,

       PERF_COUNT_HW_FPU_MAX,                  /* non-ABI */


> > Also, how would this look like on Intel, roughly?
> > 
> 
> Intel have almost all of them + divide.
> 
> As you know I work from home and I do not have any Intel machine which
> supports PMU.
> 
> Can you suggest your machine name so that I can prepare the FPU events
> list for your machine and you can verify it on your side.
> 

For Nehalem it will look like :

 static const u64 nehalem_hw_fpu_event_ids[] =
 {
  [PERF_COUNT_HW_FPU_ADD]               = 0x01B1, /* UOPS_EXECUTED.PORT0     */
  [PERF_COUNT_HW_FPU_MULTIPLY]          = 0x0214, /* ARITH.MUL               */
  [PERF_COUNT_HW_FPU_DIVIDE]            = 0x0114, /* ARITH.CYCLES_DIV_BUSY   */
  [PERF_COUNT_HW_FPU_EMPTY]             = 0x0,
  [PERF_COUNT_HW_FPU_STALL]             = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
  [PERF_COUNT_HW_FPU_X87]               = 0x0110, /* FP_COMP_OPS_EXE.X87     */
  [PERF_COUNT_HW_FPU_MMX]               = 0x0210, /* FP_COMP_OPS_EXE.MMX     */
  [PERF_COUNT_HW_FPU_SSE]               = 0x0410, /* FP_COMP_OPS_EXE.SSE_FP  */
};

Do these look OK to you? Can I resend the patch based on them?

Thanks,
--
JSR


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-30 13:20   ` Jaswinder Singh Rajput
  2009-06-30 14:56     ` Jaswinder Singh Rajput
@ 2009-06-30 22:42     ` Ingo Molnar
  2009-06-30 23:14       ` Alan Cox
  2009-07-01 12:33       ` Paul Mackerras
  1 sibling, 2 replies; 9+ messages in thread
From: Ingo Molnar @ 2009-06-30 22:42 UTC (permalink / raw)
  To: Jaswinder Singh Rajput, Paul Mackerras
  Cc: Thomas Gleixner, Peter Zijlstra, x86 maintainers, LKML


* Jaswinder Singh Rajput <jaswinder@kernel.org> wrote:

> > Plus things like mmx-3dnow are AMD specific, sse-sse2 is x86 
> > specific. We definitely want this general table, but the events 
> > should be truly general.
> 
> mmx and sse are available for both Intel and AMD. Thats why I 
> added both of them. Is it OK.

'3dnow' is an AMD marketing term. (and is long obsolete)

Nor did you answer (or understand) my sentence above: 'sse' is an 
x86 specific term.

I think a naming and enumeration scheme around the general concept 
of 'vectored co-processor' would be far less x86 specific.

Mockup:

        19583739  vec-adds               (   2.01x scaled)
        20856051  vec-muls               (   2.01x scaled)
        20856051  vec-divs               (   2.01x scaled)
     25100224054  vec-idle-cycles        (   1.99x scaled)
        12540131  vec-busy-cycles        (   1.99x scaled)
        42286702  vec-ops                (   2.01x scaled)

Paulus: would this categorization fit PowerPC too?

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-30 22:42     ` Ingo Molnar
@ 2009-06-30 23:14       ` Alan Cox
  2009-07-01 12:33       ` Paul Mackerras
  1 sibling, 0 replies; 9+ messages in thread
From: Alan Cox @ 2009-06-30 23:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jaswinder Singh Rajput, Paul Mackerras, Thomas Gleixner,
	Peter Zijlstra, x86 maintainers, LKML

> '3dnow' is an AMD marketing term. (and is long obsolete)

And an instruction set extension as well so different to SSE

(also be careful of MMX as there is MMX and MMX-EXT (Cyrix/AMD).

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-06-30 22:42     ` Ingo Molnar
  2009-06-30 23:14       ` Alan Cox
@ 2009-07-01 12:33       ` Paul Mackerras
  2009-07-01 13:12         ` Ingo Molnar
  2009-07-01 13:25         ` Jaswinder Singh Rajput
  1 sibling, 2 replies; 9+ messages in thread
From: Paul Mackerras @ 2009-07-01 12:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jaswinder Singh Rajput, Thomas Gleixner, Peter Zijlstra,
	x86 maintainers, LKML

Ingo Molnar writes:

>         19583739  vec-adds               (   2.01x scaled)
>         20856051  vec-muls               (   2.01x scaled)
>         20856051  vec-divs               (   2.01x scaled)
>      25100224054  vec-idle-cycles        (   1.99x scaled)
>         12540131  vec-busy-cycles        (   1.99x scaled)
>         42286702  vec-ops                (   2.01x scaled)
> 
> Paulus: would this categorization fit PowerPC too?

Conceptually that looks nice, but unfortunately we don't have events
that correspond to that categorization on any PowerPC with vector
hardware (VMX/Altivec).  POWER6 seems to have the most vector events,
and they are mostly divided up along the lines of simple / complex /
permute / load / store operations, and whether they are integer or
floating-point operations.

Here are the vector-related events we have on POWER6:

MRK_VMX0_LD_WRBACK	Marked VMX0 load writeback valid
MRK_VMX1_LD_WRBACK	Marked VMX1 load writeback valid
MRK_VMX_COMPLEX_ISSUED	Marked VMX instruction issued to complex
MRK_VMX_FLOAT_ISSUED	Marked VMX instruction issued to float
MRK_VMX_PERMUTE_ISSUED	Marked VMX instruction issued to permute
MRK_VMX_SIMPLE_ISSUED	Marked VMX instruction issued to simple
MRK_VMX_ST_ISSUED	Marked VMX store issued
VMX0_INST_ISSUED	VMX0 instruction issued
VMX0_LD_ISSUED		VMX0 load issued
VMX0_LD_WRBACK		VMX0 load writeback valid
VMX0_STALL		VMX0 stall
VMX1_INST_ISSUED	VMX1 instruction issued
VMX1_LD_ISSUED		VMX1 load issued
VMX1_LD_WRBACK		VMX1 load writeback valid
VMX1_STALL		VMX1 stall
VMX_COMPLEX_ISSUED	VMX instruction issued to complex
VMX_FLOAT_ISSUED	VMX instruction issued to float
VMX_FLOAT_MULTICYCLE	VMX multi-cycle floating point instruction issued
VMX_PERMUTE_ISSUED	VMX instruction issued to permute
VMX_RESULT_SAT_0_1	VMX valid result with sat bit is set (0->1)
VMX_RESULT_SAT_1	VMX valid result with sat=1
VMX_SIMPLE_ISSUED	VMX instruction issued to simple
VMX_ST_ISSUED		VMX store issued

I'm not sure what the exact distinction is between VMX0 and VMX1.
I'll find out.

The MPC7450 (G4, 32-bit) cpu also has quite a few VMX/Altivec events,
such as counts of cycles that individual vector units are waiting for
operands, but not counts of how many vector add or vector multiply
operations are done.

Paul.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-07-01 12:33       ` Paul Mackerras
@ 2009-07-01 13:12         ` Ingo Molnar
  2009-07-01 13:25         ` Jaswinder Singh Rajput
  1 sibling, 0 replies; 9+ messages in thread
From: Ingo Molnar @ 2009-07-01 13:12 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Jaswinder Singh Rajput, Thomas Gleixner, Peter Zijlstra,
	x86 maintainers, LKML


* Paul Mackerras <paulus@samba.org> wrote:

> Ingo Molnar writes:
> 
> >         19583739  vec-adds               (   2.01x scaled)
> >         20856051  vec-muls               (   2.01x scaled)
> >         20856051  vec-divs               (   2.01x scaled)
> >      25100224054  vec-idle-cycles        (   1.99x scaled)
> >         12540131  vec-busy-cycles        (   1.99x scaled)
> >         42286702  vec-ops                (   2.01x scaled)
> > 
> > Paulus: would this categorization fit PowerPC too?
> 
> Conceptually that looks nice, but unfortunately we don't have 
> events that correspond to that categorization on any PowerPC with 
> vector hardware (VMX/Altivec).  POWER6 seems to have the most 
> vector events, and they are mostly divided up along the lines of 
> simple / complex / permute / load / store operations, and whether 
> they are integer or floating-point operations.

Here's what we have on x86:

     20177177044  vec-adds                  (scaled from 66.63%)
     34101687027  vec-muls                  (scaled from 66.64%)
      3984060862  vec-divs                  (scaled from 66.71%)
     26349684710  vec-idle-cycles           (scaled from 66.65%)
      9052001905  vec-stall-cycles          (scaled from 66.66%)
     76440734242  vec-ops                   (scaled from 66.71%)

Could at least the idle/busy/stall/total generic stats be filled in 
on powerpc, with a reasonable enough approximation? Those 
utilization metrics are the most important ones when one tries to 
figure out how well utilized the vector units are.

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD
  2009-07-01 12:33       ` Paul Mackerras
  2009-07-01 13:12         ` Ingo Molnar
@ 2009-07-01 13:25         ` Jaswinder Singh Rajput
  1 sibling, 0 replies; 9+ messages in thread
From: Jaswinder Singh Rajput @ 2009-07-01 13:25 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Ingo Molnar, Thomas Gleixner, Peter Zijlstra, x86 maintainers,
	LKML

On Wed, 2009-07-01 at 22:33 +1000, Paul Mackerras wrote:
> Ingo Molnar writes:
> 
> >         19583739  vec-adds               (   2.01x scaled)
> >         20856051  vec-muls               (   2.01x scaled)
> >         20856051  vec-divs               (   2.01x scaled)
> >      25100224054  vec-idle-cycles        (   1.99x scaled)
> >         12540131  vec-busy-cycles        (   1.99x scaled)
> >         42286702  vec-ops                (   2.01x scaled)
> > 
> > Paulus: would this categorization fit PowerPC too?
> 
> Conceptually that looks nice, but unfortunately we don't have events
> that correspond to that categorization on any PowerPC with vector
> hardware (VMX/Altivec).  POWER6 seems to have the most vector events,
> and they are mostly divided up along the lines of simple / complex /
> permute / load / store operations, and whether they are integer or
> floating-point operations.
> 
> Here are the vector-related events we have on POWER6:
> 
> MRK_VMX0_LD_WRBACK	Marked VMX0 load writeback valid
> MRK_VMX1_LD_WRBACK	Marked VMX1 load writeback valid
> MRK_VMX_COMPLEX_ISSUED	Marked VMX instruction issued to complex
> MRK_VMX_FLOAT_ISSUED	Marked VMX instruction issued to float
> MRK_VMX_PERMUTE_ISSUED	Marked VMX instruction issued to permute
> MRK_VMX_SIMPLE_ISSUED	Marked VMX instruction issued to simple
> MRK_VMX_ST_ISSUED	Marked VMX store issued
> VMX0_INST_ISSUED	VMX0 instruction issued
> VMX0_LD_ISSUED		VMX0 load issued
> VMX0_LD_WRBACK		VMX0 load writeback valid
> VMX0_STALL		VMX0 stall
> VMX1_INST_ISSUED	VMX1 instruction issued
> VMX1_LD_ISSUED		VMX1 load issued
> VMX1_LD_WRBACK		VMX1 load writeback valid
> VMX1_STALL		VMX1 stall
> VMX_COMPLEX_ISSUED	VMX instruction issued to complex
> VMX_FLOAT_ISSUED	VMX instruction issued to float
> VMX_FLOAT_MULTICYCLE	VMX multi-cycle floating point instruction issued
> VMX_PERMUTE_ISSUED	VMX instruction issued to permute
> VMX_RESULT_SAT_0_1	VMX valid result with sat bit is set (0->1)
> VMX_RESULT_SAT_1	VMX valid result with sat=1
> VMX_SIMPLE_ISSUED	VMX instruction issued to simple
> VMX_ST_ISSUED		VMX store issued
> 
> I'm not sure what the exact distinction is between VMX0 and VMX1.
> I'll find out.
> 

I am just guessing for powerpc: normally different units serve
different purposes, e.g. some do addition/multiplication and others do
division.

For example, on Intel Core i7/Nehalem:

UOPS_EXECUTED.PORT0:     Counts number of Uops executed
                         that were issued on port 0. Port 0
                         handles integer arithmetic, SIMD
                         and FP add Uops.
UOPS_EXECUTED.PORT1:     Counts number of Uops executed
                         that were issued on port 1. Port 1
                         handles integer arithmetic, SIMD,
                         integer shift, FP multiply and FP
                         divide Uops.

Can you give me a link to the hardware manual so that I can check
it out?

Thanks,
--
JSR


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2009-07-01 13:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-29  9:33 [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD Jaswinder Singh Rajput
2009-06-30 10:11 ` Ingo Molnar
2009-06-30 13:20   ` Jaswinder Singh Rajput
2009-06-30 14:56     ` Jaswinder Singh Rajput
2009-06-30 22:42     ` Ingo Molnar
2009-06-30 23:14       ` Alan Cox
2009-07-01 12:33       ` Paul Mackerras
2009-07-01 13:12         ` Ingo Molnar
2009-07-01 13:25         ` Jaswinder Singh Rajput

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox