Linux Perf Users
 help / color / mirror / Atom feed
* [PATCH] perf intel-pt: don't zero the whole perf_sample
@ 2025-01-11 17:56 Tavian Barnes
  2025-01-13  8:15 ` Adrian Hunter
  0 siblings, 1 reply; 5+ messages in thread
From: Tavian Barnes @ 2025-01-11 17:56 UTC (permalink / raw)
  To: linux-perf-users
  Cc: Tavian Barnes, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Liang, Kan, Andrew Kreimer, linux-kernel

C designated initializers like

    struct perf_sample sample = { .ip = 0, };

set every unmentioned field of the struct to zero.  But since
sizeof(struct perf_sample) == 1384, this takes a long time.

struct perf_sample does not need to be fully initialized, and even
.ip = 0 is unnecessary because intel_pt_prep_*_sample() will initialize
it.  Skipping the initialization saves about 2.5% of the execution time
when running

    $ perf script --itrace=i0

Signed-off-by: Tavian Barnes <tavianator@tavianator.com>
---
 tools/perf/util/intel-pt.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 30be6dfe09eb..c829398c5bb9 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -1764,7 +1764,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct dummy_branch_stack {
 		u64			nr;
 		u64			hw_idx;
@@ -1835,7 +1835,7 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 
 	if (intel_pt_skip_event(pt))
 		return 0;
@@ -1867,7 +1867,7 @@ static int intel_pt_synth_cycle_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	u64 period = 0;
 
 	if (ptq->sample_ipc)
@@ -1894,7 +1894,7 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 
 	if (intel_pt_skip_event(pt))
 		return 0;
@@ -1927,7 +1927,7 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_ptwrite raw;
 
 	if (intel_pt_skip_event(pt))
@@ -1953,7 +1953,7 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_cbr raw;
 	u32 flags;
 
@@ -1983,7 +1983,7 @@ static int intel_pt_synth_psb_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_psb raw;
 
 	if (intel_pt_skip_event(pt))
@@ -2009,7 +2009,7 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_mwait raw;
 
 	if (intel_pt_skip_event(pt))
@@ -2034,7 +2034,7 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_pwre raw;
 
 	if (intel_pt_skip_event(pt))
@@ -2059,7 +2059,7 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_exstop raw;
 
 	if (intel_pt_skip_event(pt))
@@ -2084,7 +2084,7 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_pwrx raw;
 
 	if (intel_pt_skip_event(pt))
@@ -2235,7 +2235,7 @@ static void intel_pt_add_lbrs(struct branch_stack *br_stack,
 static int intel_pt_do_synth_pebs_sample(struct intel_pt_queue *ptq, struct evsel *evsel, u64 id)
 {
 	const struct intel_pt_blk_items *items = &ptq->state->items;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	union perf_event *event = ptq->event_buf;
 	struct intel_pt *pt = ptq->pt;
 	u64 sample_type = evsel->core.attr.sample_type;
@@ -2407,7 +2407,7 @@ static int intel_pt_synth_events_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct {
 		struct perf_synth_intel_evt cfe;
 		struct perf_synth_intel_evd evd[INTEL_PT_MAX_EVDS];
@@ -2446,7 +2446,7 @@ static int intel_pt_synth_iflag_chg_sample(struct intel_pt_queue *ptq)
 {
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
-	struct perf_sample sample = { .ip = 0, };
+	struct perf_sample sample;
 	struct perf_synth_intel_iflag_chg raw;
 
 	if (intel_pt_skip_event(pt))
-- 
2.47.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
  2025-01-11 17:56 [PATCH] perf intel-pt: don't zero the whole perf_sample Tavian Barnes
@ 2025-01-13  8:15 ` Adrian Hunter
  2025-01-13 16:26   ` Tavian Barnes
  2025-01-13 17:30   ` Ian Rogers
  0 siblings, 2 replies; 5+ messages in thread
From: Adrian Hunter @ 2025-01-13  8:15 UTC (permalink / raw)
  To: Tavian Barnes, linux-perf-users
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Liang, Kan, Andrew Kreimer, linux-kernel

On 11/01/25 19:56, Tavian Barnes wrote:
> C designated initializers like
> 
>     struct perf_sample sample = { .ip = 0, };
> 
> set every unmentioned field of the struct to zero.  But since
> sizeof(struct perf_sample) == 1384, this takes a long time.
> 
> struct perf_sample does not need to be fully initialized, and even

Yes it does need to be fully initialized.  Leaving members
uninitialized in the hope that they never get used adds to
code complexity e.g. how do you know they never are used,
or future members never will be used.

> .ip = 0 is unnecessary because intel_pt_prep_*_sample() will initialize
> it.  Skipping the initialization saves about 2.5% of the execution time
> when running
> 
>     $ perf script --itrace=i0
> 
> Signed-off-by: Tavian Barnes <tavianator@tavianator.com>
> ---
>  tools/perf/util/intel-pt.c | 28 ++++++++++++++--------------
>  1 file changed, 14 insertions(+), 14 deletions(-)
> 
> diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
> index 30be6dfe09eb..c829398c5bb9 100644
> --- a/tools/perf/util/intel-pt.c
> +++ b/tools/perf/util/intel-pt.c
> @@ -1764,7 +1764,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct dummy_branch_stack {
>  		u64			nr;
>  		u64			hw_idx;
> @@ -1835,7 +1835,7 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  
>  	if (intel_pt_skip_event(pt))
>  		return 0;
> @@ -1867,7 +1867,7 @@ static int intel_pt_synth_cycle_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	u64 period = 0;
>  
>  	if (ptq->sample_ipc)
> @@ -1894,7 +1894,7 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  
>  	if (intel_pt_skip_event(pt))
>  		return 0;
> @@ -1927,7 +1927,7 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_ptwrite raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -1953,7 +1953,7 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_cbr raw;
>  	u32 flags;
>  
> @@ -1983,7 +1983,7 @@ static int intel_pt_synth_psb_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_psb raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -2009,7 +2009,7 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_mwait raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -2034,7 +2034,7 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_pwre raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -2059,7 +2059,7 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_exstop raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -2084,7 +2084,7 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_pwrx raw;
>  
>  	if (intel_pt_skip_event(pt))
> @@ -2235,7 +2235,7 @@ static void intel_pt_add_lbrs(struct branch_stack *br_stack,
>  static int intel_pt_do_synth_pebs_sample(struct intel_pt_queue *ptq, struct evsel *evsel, u64 id)
>  {
>  	const struct intel_pt_blk_items *items = &ptq->state->items;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	union perf_event *event = ptq->event_buf;
>  	struct intel_pt *pt = ptq->pt;
>  	u64 sample_type = evsel->core.attr.sample_type;
> @@ -2407,7 +2407,7 @@ static int intel_pt_synth_events_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct {
>  		struct perf_synth_intel_evt cfe;
>  		struct perf_synth_intel_evd evd[INTEL_PT_MAX_EVDS];
> @@ -2446,7 +2446,7 @@ static int intel_pt_synth_iflag_chg_sample(struct intel_pt_queue *ptq)
>  {
>  	struct intel_pt *pt = ptq->pt;
>  	union perf_event *event = ptq->event_buf;
> -	struct perf_sample sample = { .ip = 0, };
> +	struct perf_sample sample;
>  	struct perf_synth_intel_iflag_chg raw;
>  
>  	if (intel_pt_skip_event(pt))


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
  2025-01-13  8:15 ` Adrian Hunter
@ 2025-01-13 16:26   ` Tavian Barnes
  2025-01-13 17:30   ` Ian Rogers
  1 sibling, 0 replies; 5+ messages in thread
From: Tavian Barnes @ 2025-01-13 16:26 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: linux-perf-users, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Liang, Kan,
	Andrew Kreimer, linux-kernel

On Mon, Jan 13, 2025 at 3:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 11/01/25 19:56, Tavian Barnes wrote:
> > C designated initializers like
> >
> >     struct perf_sample sample = { .ip = 0, };
> >
> > set every unmentioned field of the struct to zero.  But since
> > sizeof(struct perf_sample) == 1384, this takes a long time.
> >
> > struct perf_sample does not need to be fully initialized, and even
>
> Yes it does need to be fully initialized.  Leaving members
> uninitialized in the hope that they never get used adds to
> code complexity e.g. how do you know they never are used,
> or future members never will be used.

Right, please ignore this patch then.  I thought I had seen other
cases where perf_sample was not fully initialized but I wasn't reading
the code carefully enough.  Sorry for the noise!

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
  2025-01-13  8:15 ` Adrian Hunter
  2025-01-13 16:26   ` Tavian Barnes
@ 2025-01-13 17:30   ` Ian Rogers
  2025-01-13 19:45     ` Ian Rogers
  1 sibling, 1 reply; 5+ messages in thread
From: Ian Rogers @ 2025-01-13 17:30 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Tavian Barnes, linux-perf-users, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Liang, Kan, Andrew Kreimer,
	linux-kernel

On Mon, Jan 13, 2025 at 12:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>
> On 11/01/25 19:56, Tavian Barnes wrote:
> > C designated initializers like
> >
> >     struct perf_sample sample = { .ip = 0, };
> >
> > set every unmentioned field of the struct to zero.  But since
> > sizeof(struct perf_sample) == 1384, this takes a long time.
> >
> > struct perf_sample does not need to be fully initialized, and even
>
> Yes it does need to be fully initialized.  Leaving members
> uninitialized in the hope that they never get used adds to
> code complexity e.g. how do you know they never are used,
> or future members never will be used.

First thought, perhaps we can use sanitizers to catch mistakes but
leave the default optimized code to not have any overhead.

Second thought, the perf_sample looks like:
```
$ pahole perf
...
struct perf_sample {
       u64                        ip;                   /*     0     8 */
       u32                        pid;                  /*     8     4 */
       u32                        tid;                  /*    12     4 */
       u64                        time;                 /*    16     8 */
       u64                        addr;                 /*    24     8 */
       u64                        id;                   /*    32     8 */
       u64                        stream_id;            /*    40     8 */
       u64                        period;               /*    48     8 */
       u64                        weight;               /*    56     8 */
       /* --- cacheline 1 boundary (64 bytes) --- */
       u64                        transaction;          /*    64     8 */
       u64                        insn_cnt;             /*    72     8 */
       u64                        cyc_cnt;              /*    80     8 */
       u32                        cpu;                  /*    88     4 */
       u32                        raw_size;             /*    92     4 */
       u64                        data_src;             /*    96     8 */
       u64                        phys_addr;            /*   104     8 */
       u64                        data_page_size;       /*   112     8 */
       u64                        code_page_size;       /*   120     8 */
       /* --- cacheline 2 boundary (128 bytes) --- */
       u64                        cgroup;               /*   128     8 */
       u32                        flags;                /*   136     4 */
       u32                        machine_pid;          /*   140     4 */
       u32                        vcpu;                 /*   144     4 */
       u16                        insn_len;             /*   148     2 */
       u8                         cpumode;              /*   150     1 */

       /* XXX 1 byte hole, try to pack */

       u16                        misc;                 /*   152     2 */
       u16                        ins_lat;              /*   154     2 */
       union {
               u16                p_stage_cyc;          /*   156     2 */
               u16                retire_lat;           /*   156     2 */
       };                                               /*   156     2 */
       union {
               u16                        p_stage_cyc;          /*     0     2 */
               u16                        retire_lat;           /*     0     2 */
       };

       _Bool                      no_hw_idx;            /*   158     1 */
       char                       insn[16];             /*   159    16 */

       /* XXX 1 byte hole, try to pack */

       void *                     raw_data;             /*   176     8 */
       struct ip_callchain *      callchain;            /*   184     8 */
       /* --- cacheline 3 boundary (192 bytes) --- */
       struct branch_stack *      branch_stack;         /*   192     8 */
       u64 *                      branch_stack_cntr;    /*   200     8 */
       struct regs_dump           user_regs;            /*   208   544 */
       /* --- cacheline 11 boundary (704 bytes) was 48 bytes ago --- */
       struct regs_dump           intr_regs;            /*   752   544 */
       /* --- cacheline 20 boundary (1280 bytes) was 16 bytes ago --- */
       struct stack_dump          user_stack;           /*  1296    24 */
       struct sample_read         read;                 /*  1320    40 */
        /* --- cacheline 21 boundary (1344 bytes) was 16 bytes ago --- */
       struct aux_sample          aux_sample;           /*  1360    16 */
       struct simd_flags          simd_flags;           /*  1376     8 */

       /* size: 1384, cachelines: 22, members: 39 */
       /* sum members: 1382, holes: 2, sum holes: 2 */
       /* last cacheline: 40 bytes */
};
```
The two regs_dump structs are 79% of the size. They comprise of:
```
struct regs_dump {
       u64                        abi;                  /*     0     8 */
       u64                        mask;                 /*     8     8 */
       u64 *                      regs;                 /*    16     8 */
       u64                        cache_regs[64];       /*    24   512 */
       /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
       u64                        cache_mask;           /*   536     8 */

       /* size: 544, cachelines: 9, members: 5 */
       /* last cacheline: 32 bytes */
};
```
So the cache_regs are taking up the majority of the space. I have a
WIP patch set to make user_regs and intr_regs optional/lazily
allocated. I'll send it out shortly.

Thanks,
Ian

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
  2025-01-13 17:30   ` Ian Rogers
@ 2025-01-13 19:45     ` Ian Rogers
  0 siblings, 0 replies; 5+ messages in thread
From: Ian Rogers @ 2025-01-13 19:45 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Tavian Barnes, linux-perf-users, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Liang, Kan, Andrew Kreimer,
	linux-kernel

On Mon, Jan 13, 2025 at 9:30 AM Ian Rogers <irogers@google.com> wrote:
>
> On Mon, Jan 13, 2025 at 12:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
> >
> > On 11/01/25 19:56, Tavian Barnes wrote:
> > > C designated initializers like
> > >
> > >     struct perf_sample sample = { .ip = 0, };
> > >
> > > set every unmentioned field of the struct to zero.  But since
> > > sizeof(struct perf_sample) == 1384, this takes a long time.
> > >
> > > struct perf_sample does not need to be fully initialized, and even
> >
> > Yes it does need to be fully initialized.  Leaving members
> > uninitialized in the hope that they never get used adds to
> > code complexity e.g. how do you know they never are used,
> > or future members never will be used.
>
> First thought, perhaps we can use sanitizers to catch mistakes but
> leave the default optimized code to not have any overhead.
>
> Second thought, the perf_sample looks like:
> ```
> $ pahole perf
> ...
> struct perf_sample {
>        u64                        ip;                   /*     0     8 */
>        u32                        pid;                  /*     8     4 */
>        u32                        tid;                  /*    12     4 */
>        u64                        time;                 /*    16     8 */
>        u64                        addr;                 /*    24     8 */
>        u64                        id;                   /*    32     8 */
>        u64                        stream_id;            /*    40     8 */
>        u64                        period;               /*    48     8 */
>        u64                        weight;               /*    56     8 */
>        /* --- cacheline 1 boundary (64 bytes) --- */
>        u64                        transaction;          /*    64     8 */
>        u64                        insn_cnt;             /*    72     8 */
>        u64                        cyc_cnt;              /*    80     8 */
>        u32                        cpu;                  /*    88     4 */
>        u32                        raw_size;             /*    92     4 */
>        u64                        data_src;             /*    96     8 */
>        u64                        phys_addr;            /*   104     8 */
>        u64                        data_page_size;       /*   112     8 */
>        u64                        code_page_size;       /*   120     8 */
>        /* --- cacheline 2 boundary (128 bytes) --- */
>        u64                        cgroup;               /*   128     8 */
>        u32                        flags;                /*   136     4 */
>        u32                        machine_pid;          /*   140     4 */
>        u32                        vcpu;                 /*   144     4 */
>        u16                        insn_len;             /*   148     2 */
>        u8                         cpumode;              /*   150     1 */
>
>        /* XXX 1 byte hole, try to pack */
>
>        u16                        misc;                 /*   152     2 */
>        u16                        ins_lat;              /*   154     2 */
>        union {
>                u16                p_stage_cyc;          /*   156     2 */
>                u16                retire_lat;           /*   156     2 */
>        };                                               /*   156     2 */
>        union {
>                u16                        p_stage_cyc;          /*     0     2 */
>                u16                        retire_lat;           /*     0     2 */
>        };
>
>        _Bool                      no_hw_idx;            /*   158     1 */
>        char                       insn[16];             /*   159    16 */
>
>        /* XXX 1 byte hole, try to pack */
>
>        void *                     raw_data;             /*   176     8 */
>        struct ip_callchain *      callchain;            /*   184     8 */
>        /* --- cacheline 3 boundary (192 bytes) --- */
>        struct branch_stack *      branch_stack;         /*   192     8 */
>        u64 *                      branch_stack_cntr;    /*   200     8 */
>        struct regs_dump           user_regs;            /*   208   544 */
>        /* --- cacheline 11 boundary (704 bytes) was 48 bytes ago --- */
>        struct regs_dump           intr_regs;            /*   752   544 */
>        /* --- cacheline 20 boundary (1280 bytes) was 16 bytes ago --- */
>        struct stack_dump          user_stack;           /*  1296    24 */
>        struct sample_read         read;                 /*  1320    40 */
>         /* --- cacheline 21 boundary (1344 bytes) was 16 bytes ago --- */
>        struct aux_sample          aux_sample;           /*  1360    16 */
>        struct simd_flags          simd_flags;           /*  1376     8 */
>
>        /* size: 1384, cachelines: 22, members: 39 */
>        /* sum members: 1382, holes: 2, sum holes: 2 */
>        /* last cacheline: 40 bytes */
> };
> ```
> The two regs_dump structs are 79% of the size. They comprise of:
> ```
> struct regs_dump {
>        u64                        abi;                  /*     0     8 */
>        u64                        mask;                 /*     8     8 */
>        u64 *                      regs;                 /*    16     8 */
>        u64                        cache_regs[64];       /*    24   512 */
>        /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
>        u64                        cache_mask;           /*   536     8 */
>
>        /* size: 544, cachelines: 9, members: 5 */
>        /* last cacheline: 32 bytes */
> };
> ```
> So the cache_regs are taking up the majority of the space. I have a
> WIP patch set to make user_regs and intr_regs optional/lazily
> allocated. I'll send it out shortly.

Sent as:
https://lore.kernel.org/lkml/20250113194345.1537821-1-irogers@google.com/

Thanks,
Ian

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-01-13 19:45 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-01-11 17:56 [PATCH] perf intel-pt: don't zero the whole perf_sample Tavian Barnes
2025-01-13  8:15 ` Adrian Hunter
2025-01-13 16:26   ` Tavian Barnes
2025-01-13 17:30   ` Ian Rogers
2025-01-13 19:45     ` Ian Rogers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox