* [PATCH] perf intel-pt: don't zero the whole perf_sample
@ 2025-01-11 17:56 Tavian Barnes
2025-01-13 8:15 ` Adrian Hunter
0 siblings, 1 reply; 5+ messages in thread
From: Tavian Barnes @ 2025-01-11 17:56 UTC (permalink / raw)
To: linux-perf-users
Cc: Tavian Barnes, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
Liang, Kan, Andrew Kreimer, linux-kernel
C designated initializers like
struct perf_sample sample = { .ip = 0, };
set every unmentioned field of the struct to zero. But since
sizeof(struct perf_sample) == 1384, this takes a long time.
struct perf_sample does not need to be fully initialized, and even
.ip = 0 is unnecessary because intel_pt_prep_*_sample() will initialize
it. Skipping the initialization saves about 2.5% of the execution time
when running
$ perf script --itrace=i0
Signed-off-by: Tavian Barnes <tavianator@tavianator.com>
---
tools/perf/util/intel-pt.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 30be6dfe09eb..c829398c5bb9 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -1764,7 +1764,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct dummy_branch_stack {
u64 nr;
u64 hw_idx;
@@ -1835,7 +1835,7 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
if (intel_pt_skip_event(pt))
return 0;
@@ -1867,7 +1867,7 @@ static int intel_pt_synth_cycle_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
u64 period = 0;
if (ptq->sample_ipc)
@@ -1894,7 +1894,7 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
if (intel_pt_skip_event(pt))
return 0;
@@ -1927,7 +1927,7 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_ptwrite raw;
if (intel_pt_skip_event(pt))
@@ -1953,7 +1953,7 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_cbr raw;
u32 flags;
@@ -1983,7 +1983,7 @@ static int intel_pt_synth_psb_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_psb raw;
if (intel_pt_skip_event(pt))
@@ -2009,7 +2009,7 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_mwait raw;
if (intel_pt_skip_event(pt))
@@ -2034,7 +2034,7 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_pwre raw;
if (intel_pt_skip_event(pt))
@@ -2059,7 +2059,7 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_exstop raw;
if (intel_pt_skip_event(pt))
@@ -2084,7 +2084,7 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_pwrx raw;
if (intel_pt_skip_event(pt))
@@ -2235,7 +2235,7 @@ static void intel_pt_add_lbrs(struct branch_stack *br_stack,
static int intel_pt_do_synth_pebs_sample(struct intel_pt_queue *ptq, struct evsel *evsel, u64 id)
{
const struct intel_pt_blk_items *items = &ptq->state->items;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
union perf_event *event = ptq->event_buf;
struct intel_pt *pt = ptq->pt;
u64 sample_type = evsel->core.attr.sample_type;
@@ -2407,7 +2407,7 @@ static int intel_pt_synth_events_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct {
struct perf_synth_intel_evt cfe;
struct perf_synth_intel_evd evd[INTEL_PT_MAX_EVDS];
@@ -2446,7 +2446,7 @@ static int intel_pt_synth_iflag_chg_sample(struct intel_pt_queue *ptq)
{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
struct perf_synth_intel_iflag_chg raw;
if (intel_pt_skip_event(pt))
--
2.47.1
^ permalink raw reply related [flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
2025-01-11 17:56 [PATCH] perf intel-pt: don't zero the whole perf_sample Tavian Barnes
@ 2025-01-13 8:15 ` Adrian Hunter
2025-01-13 16:26 ` Tavian Barnes
2025-01-13 17:30 ` Ian Rogers
0 siblings, 2 replies; 5+ messages in thread
From: Adrian Hunter @ 2025-01-13 8:15 UTC (permalink / raw)
To: Tavian Barnes, linux-perf-users
Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Liang, Kan, Andrew Kreimer, linux-kernel
On 11/01/25 19:56, Tavian Barnes wrote:
> C designated initializers like
>
> struct perf_sample sample = { .ip = 0, };
>
> set every unmentioned field of the struct to zero. But since
> sizeof(struct perf_sample) == 1384, this takes a long time.
>
> struct perf_sample does not need to be fully initialized, and even
Yes it does need to be fully initialized. Leaving members
uninitialized in the hope that they never get used adds to
code complexity e.g. how do you know they never are used,
or future members never will be used.
> .ip = 0 is unnecessary because intel_pt_prep_*_sample() will initialize
> it. Skipping the initialization saves about 2.5% of the execution time
> when running
>
> $ perf script --itrace=i0
>
> Signed-off-by: Tavian Barnes <tavianator@tavianator.com>
> ---
> tools/perf/util/intel-pt.c | 28 ++++++++++++++--------------
> 1 file changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
> index 30be6dfe09eb..c829398c5bb9 100644
> --- a/tools/perf/util/intel-pt.c
> +++ b/tools/perf/util/intel-pt.c
> @@ -1764,7 +1764,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct dummy_branch_stack {
> u64 nr;
> u64 hw_idx;
> @@ -1835,7 +1835,7 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
>
> if (intel_pt_skip_event(pt))
> return 0;
> @@ -1867,7 +1867,7 @@ static int intel_pt_synth_cycle_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> u64 period = 0;
>
> if (ptq->sample_ipc)
> @@ -1894,7 +1894,7 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
>
> if (intel_pt_skip_event(pt))
> return 0;
> @@ -1927,7 +1927,7 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_ptwrite raw;
>
> if (intel_pt_skip_event(pt))
> @@ -1953,7 +1953,7 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_cbr raw;
> u32 flags;
>
> @@ -1983,7 +1983,7 @@ static int intel_pt_synth_psb_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_psb raw;
>
> if (intel_pt_skip_event(pt))
> @@ -2009,7 +2009,7 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_mwait raw;
>
> if (intel_pt_skip_event(pt))
> @@ -2034,7 +2034,7 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_pwre raw;
>
> if (intel_pt_skip_event(pt))
> @@ -2059,7 +2059,7 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_exstop raw;
>
> if (intel_pt_skip_event(pt))
> @@ -2084,7 +2084,7 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_pwrx raw;
>
> if (intel_pt_skip_event(pt))
> @@ -2235,7 +2235,7 @@ static void intel_pt_add_lbrs(struct branch_stack *br_stack,
> static int intel_pt_do_synth_pebs_sample(struct intel_pt_queue *ptq, struct evsel *evsel, u64 id)
> {
> const struct intel_pt_blk_items *items = &ptq->state->items;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> union perf_event *event = ptq->event_buf;
> struct intel_pt *pt = ptq->pt;
> u64 sample_type = evsel->core.attr.sample_type;
> @@ -2407,7 +2407,7 @@ static int intel_pt_synth_events_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct {
> struct perf_synth_intel_evt cfe;
> struct perf_synth_intel_evd evd[INTEL_PT_MAX_EVDS];
> @@ -2446,7 +2446,7 @@ static int intel_pt_synth_iflag_chg_sample(struct intel_pt_queue *ptq)
> {
> struct intel_pt *pt = ptq->pt;
> union perf_event *event = ptq->event_buf;
> - struct perf_sample sample = { .ip = 0, };
> + struct perf_sample sample;
> struct perf_synth_intel_iflag_chg raw;
>
> if (intel_pt_skip_event(pt))
^ permalink raw reply [flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
2025-01-13 8:15 ` Adrian Hunter
@ 2025-01-13 16:26 ` Tavian Barnes
2025-01-13 17:30 ` Ian Rogers
1 sibling, 0 replies; 5+ messages in thread
From: Tavian Barnes @ 2025-01-13 16:26 UTC (permalink / raw)
To: Adrian Hunter
Cc: linux-perf-users, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Liang, Kan,
Andrew Kreimer, linux-kernel
On Mon, Jan 13, 2025 at 3:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 11/01/25 19:56, Tavian Barnes wrote:
> > C designated initializers like
> >
> > struct perf_sample sample = { .ip = 0, };
> >
> > set every unmentioned field of the struct to zero. But since
> > sizeof(struct perf_sample) == 1384, this takes a long time.
> >
> > struct perf_sample does not need to be fully initialized, and even
>
> Yes it does need to be fully initialized. Leaving members
> uninitialized in the hope that they never get used adds to
> code complexity e.g. how do you know they never are used,
> or future members never will be used.
Right, please ignore this patch then. I thought I had seen other
cases where perf_sample was not fully initialized but I wasn't reading
the code carefully enough. Sorry for the noise!
^ permalink raw reply [flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
2025-01-13 8:15 ` Adrian Hunter
2025-01-13 16:26 ` Tavian Barnes
@ 2025-01-13 17:30 ` Ian Rogers
2025-01-13 19:45 ` Ian Rogers
1 sibling, 1 reply; 5+ messages in thread
From: Ian Rogers @ 2025-01-13 17:30 UTC (permalink / raw)
To: Adrian Hunter
Cc: Tavian Barnes, linux-perf-users, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Liang, Kan, Andrew Kreimer,
linux-kernel
On Mon, Jan 13, 2025 at 12:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>
> On 11/01/25 19:56, Tavian Barnes wrote:
> > C designated initializers like
> >
> > struct perf_sample sample = { .ip = 0, };
> >
> > set every unmentioned field of the struct to zero. But since
> > sizeof(struct perf_sample) == 1384, this takes a long time.
> >
> > struct perf_sample does not need to be fully initialized, and even
>
> Yes it does need to be fully initialized. Leaving members
> uninitialized in the hope that they never get used adds to
> code complexity e.g. how do you know they never are used,
> or future members never will be used.
First thought, perhaps we can use sanitizers to catch mistakes but
leave the default optimized code to not have any overhead.
Second thought, the perf_sample looks like:
```
$ pahole perf
...
struct perf_sample {
u64 ip; /* 0 8 */
u32 pid; /* 8 4 */
u32 tid; /* 12 4 */
u64 time; /* 16 8 */
u64 addr; /* 24 8 */
u64 id; /* 32 8 */
u64 stream_id; /* 40 8 */
u64 period; /* 48 8 */
u64 weight; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
u64 transaction; /* 64 8 */
u64 insn_cnt; /* 72 8 */
u64 cyc_cnt; /* 80 8 */
u32 cpu; /* 88 4 */
u32 raw_size; /* 92 4 */
u64 data_src; /* 96 8 */
u64 phys_addr; /* 104 8 */
u64 data_page_size; /* 112 8 */
u64 code_page_size; /* 120 8 */
/* --- cacheline 2 boundary (128 bytes) --- */
u64 cgroup; /* 128 8 */
u32 flags; /* 136 4 */
u32 machine_pid; /* 140 4 */
u32 vcpu; /* 144 4 */
u16 insn_len; /* 148 2 */
u8 cpumode; /* 150 1 */
/* XXX 1 byte hole, try to pack */
u16 misc; /* 152 2 */
u16 ins_lat; /* 154 2 */
union {
u16 p_stage_cyc; /* 156 2 */
u16 retire_lat; /* 156 2 */
}; /* 156 2 */
_Bool no_hw_idx; /* 158 1 */
char insn[16]; /* 159 16 */
/* XXX 1 byte hole, try to pack */
void * raw_data; /* 176 8 */
struct ip_callchain * callchain; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct branch_stack * branch_stack; /* 192 8 */
u64 * branch_stack_cntr; /* 200 8 */
struct regs_dump user_regs; /* 208 544 */
/* --- cacheline 11 boundary (704 bytes) was 48 bytes ago --- */
struct regs_dump intr_regs; /* 752 544 */
/* --- cacheline 20 boundary (1280 bytes) was 16 bytes ago --- */
struct stack_dump user_stack; /* 1296 24 */
struct sample_read read; /* 1320 40 */
/* --- cacheline 21 boundary (1344 bytes) was 16 bytes ago --- */
struct aux_sample aux_sample; /* 1360 16 */
struct simd_flags simd_flags; /* 1376 8 */
/* size: 1384, cachelines: 22, members: 39 */
/* sum members: 1382, holes: 2, sum holes: 2 */
/* last cacheline: 40 bytes */
};
```
The two regs_dump structs are 79% of the size. They comprise of:
```
struct regs_dump {
u64 abi; /* 0 8 */
u64 mask; /* 8 8 */
u64 * regs; /* 16 8 */
u64 cache_regs[64]; /* 24 512 */
/* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
u64 cache_mask; /* 536 8 */
/* size: 544, cachelines: 9, members: 5 */
/* last cacheline: 32 bytes */
};
```
So the cache_regs are taking up the majority of the space. I have a
WIP patch set to make user_regs and intr_regs optional/lazily
allocated. I'll send it out shortly.
Thanks,
Ian
^ permalink raw reply [flat|nested] 5+ messages in thread

* Re: [PATCH] perf intel-pt: don't zero the whole perf_sample
2025-01-13 17:30 ` Ian Rogers
@ 2025-01-13 19:45 ` Ian Rogers
0 siblings, 0 replies; 5+ messages in thread
From: Ian Rogers @ 2025-01-13 19:45 UTC (permalink / raw)
To: Adrian Hunter
Cc: Tavian Barnes, linux-perf-users, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Liang, Kan, Andrew Kreimer,
linux-kernel
On Mon, Jan 13, 2025 at 9:30 AM Ian Rogers <irogers@google.com> wrote:
>
> On Mon, Jan 13, 2025 at 12:15 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
> >
> > On 11/01/25 19:56, Tavian Barnes wrote:
> > > C designated initializers like
> > >
> > > struct perf_sample sample = { .ip = 0, };
> > >
> > > set every unmentioned field of the struct to zero. But since
> > > sizeof(struct perf_sample) == 1384, this takes a long time.
> > >
> > > struct perf_sample does not need to be fully initialized, and even
> >
> > Yes it does need to be fully initialized. Leaving members
> > uninitialized in the hope that they never get used adds to
> > code complexity e.g. how do you know they never are used,
> > or future members never will be used.
>
> First thought, perhaps we can use sanitizers to catch mistakes but
> leave the default optimized code to not have any overhead.
>
> Second thought, the perf_sample looks like:
> ```
> $ pahole perf
> ...
> struct perf_sample {
> u64 ip; /* 0 8 */
> u32 pid; /* 8 4 */
> u32 tid; /* 12 4 */
> u64 time; /* 16 8 */
> u64 addr; /* 24 8 */
> u64 id; /* 32 8 */
> u64 stream_id; /* 40 8 */
> u64 period; /* 48 8 */
> u64 weight; /* 56 8 */
> /* --- cacheline 1 boundary (64 bytes) --- */
> u64 transaction; /* 64 8 */
> u64 insn_cnt; /* 72 8 */
> u64 cyc_cnt; /* 80 8 */
> u32 cpu; /* 88 4 */
> u32 raw_size; /* 92 4 */
> u64 data_src; /* 96 8 */
> u64 phys_addr; /* 104 8 */
> u64 data_page_size; /* 112 8 */
> u64 code_page_size; /* 120 8 */
> /* --- cacheline 2 boundary (128 bytes) --- */
> u64 cgroup; /* 128 8 */
> u32 flags; /* 136 4 */
> u32 machine_pid; /* 140 4 */
> u32 vcpu; /* 144 4 */
> u16 insn_len; /* 148 2 */
> u8 cpumode; /* 150 1 */
>
> /* XXX 1 byte hole, try to pack */
>
> u16 misc; /* 152 2 */
> u16 ins_lat; /* 154 2 */
> union {
> u16 p_stage_cyc; /* 156 2 */
> u16 retire_lat; /* 156 2 */
> }; /* 156 2 */
>
> _Bool no_hw_idx; /* 158 1 */
> char insn[16]; /* 159 16 */
>
> /* XXX 1 byte hole, try to pack */
>
> void * raw_data; /* 176 8 */
> struct ip_callchain * callchain; /* 184 8 */
> /* --- cacheline 3 boundary (192 bytes) --- */
> struct branch_stack * branch_stack; /* 192 8 */
> u64 * branch_stack_cntr; /* 200 8 */
> struct regs_dump user_regs; /* 208 544 */
> /* --- cacheline 11 boundary (704 bytes) was 48 bytes ago --- */
> struct regs_dump intr_regs; /* 752 544 */
> /* --- cacheline 20 boundary (1280 bytes) was 16 bytes ago --- */
> struct stack_dump user_stack; /* 1296 24 */
> struct sample_read read; /* 1320 40 */
> /* --- cacheline 21 boundary (1344 bytes) was 16 bytes ago --- */
> struct aux_sample aux_sample; /* 1360 16 */
> struct simd_flags simd_flags; /* 1376 8 */
>
> /* size: 1384, cachelines: 22, members: 39 */
> /* sum members: 1382, holes: 2, sum holes: 2 */
> /* last cacheline: 40 bytes */
> };
> ```
> The two regs_dump structs are 79% of the size. They comprise of:
> ```
> struct regs_dump {
> u64 abi; /* 0 8 */
> u64 mask; /* 8 8 */
> u64 * regs; /* 16 8 */
> u64 cache_regs[64]; /* 24 512 */
> /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
> u64 cache_mask; /* 536 8 */
>
> /* size: 544, cachelines: 9, members: 5 */
> /* last cacheline: 32 bytes */
> };
> ```
> So the cache_regs are taking up the majority of the space. I have a
> WIP patch set to make user_regs and intr_regs optional/lazily
> allocated. I'll send it out shortly.
Sent as:
https://lore.kernel.org/lkml/20250113194345.1537821-1-irogers@google.com/
Thanks,
Ian
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-01-13 19:45 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-11 17:56 [PATCH] perf intel-pt: don't zero the whole perf_sample Tavian Barnes
2025-01-13 8:15 ` Adrian Hunter
2025-01-13 16:26 ` Tavian Barnes
2025-01-13 17:30 ` Ian Rogers
2025-01-13 19:45 ` Ian Rogers
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox