* [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-06 14:40 [PATCH v7 0/4] perf: Support PERF_SAMPLE_READ with inherit Ben Gainey
@ 2024-06-06 14:40 ` Ben Gainey
2024-06-07 9:32 ` Peter Zijlstra
2024-06-06 14:40 ` [PATCH v7 2/4] tools/perf: Track where perf_sample_ids need per-thread periods Ben Gainey
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Ben Gainey @ 2024-06-06 14:40 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: james.clark, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel, Ben Gainey
This change allows events to use PERF_SAMPLE_READ with inherit
so long as PERF_SAMPLE_TID is also set.
In this configuration, an event will be inherited into any
child processes / threads, allowing convenient profiling of a
multiprocess or multithreaded application, whilst allowing
profiling tools to collect per-thread samples, in particular
of groups of counters.
The read_format field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE
is changed by this new configuration, but calls to `read()` on the same
event file descriptor are unaffected and continue to return the
cumulative total.
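For illustration, a profiler wanting per-thread values from an inherited
event might open the group leader roughly as follows (a minimal
userspace sketch, not part of this patch; the event choice, period and
omitted error handling are illustrative):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_sampling_leader(pid_t pid)
	{
		struct perf_event_attr attr = {
			.type          = PERF_TYPE_HARDWARE,
			.size          = sizeof(attr),
			.config        = PERF_COUNT_HW_CPU_CYCLES,
			.sample_period = 1000000,
			/* PERF_SAMPLE_TID is required alongside PERF_SAMPLE_READ */
			.sample_type   = PERF_SAMPLE_TID | PERF_SAMPLE_READ,
			.read_format   = PERF_FORMAT_GROUP | PERF_FORMAT_ID,
			.disabled      = 1,
			.inherit       = 1, /* newly permitted with PERF_SAMPLE_READ */
		};

		/* cpu == -1: follow the task; group_fd == -1: start a new group */
		return syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
	}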
Signed-off-by: Ben Gainey <ben.gainey@arm.com>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 78 ++++++++++++++++++++++++++++----------
2 files changed, 58 insertions(+), 21 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a5304ae8c654..810e0fe85572 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -935,6 +935,7 @@ struct perf_event_context {
int nr_task_data;
int nr_stat;
+ int nr_inherit_read;
int nr_freq;
int rotate_disable;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0128c5ff278..5c4f292bc6ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1767,6 +1767,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
event = rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
+/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -1797,6 +1805,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ ctx->nr_inherit_read++;
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
@@ -2021,6 +2031,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ ctx->nr_inherit_read--;
list_del_rcu(&event->event_entry);
@@ -3532,11 +3544,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */
- if (local_read(&ctx->nr_pending) ||
+ if (ctx->nr_inherit_read ||
+ next_ctx->nr_inherit_read ||
+ local_read(&ctx->nr_pending) ||
local_read(&next_ctx->nr_pending)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
@@ -4552,11 +4571,19 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count_cumulative(struct perf_event *event)
{
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static inline u64 perf_event_count(struct perf_event *event, bool self_value_only)
+{
+ if (self_value_only && has_inherit_and_sample_read(&event->attr))
+ return local64_read(&event->count);
+
+ return perf_event_count_cumulative(event);
+}
+
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
@@ -5473,7 +5500,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count_cumulative(event);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -5482,7 +5509,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count_cumulative(child);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -5564,14 +5591,14 @@ static int __perf_read_group_add(struct perf_event *leader,
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count_cumulative(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count_cumulative(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -6151,7 +6178,7 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count_cumulative(event);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
@@ -7205,13 +7232,14 @@ void perf_event__output_id_sample(struct perf_event *event,
static void perf_output_read_one(struct perf_output_handle *handle,
struct perf_event *event,
- u64 enabled, u64 running)
+ u64 enabled, u64 running,
+ bool from_sample)
{
u64 read_format = event->attr.read_format;
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, from_sample);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -7229,8 +7257,9 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running,
+ bool from_sample)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
@@ -7256,7 +7285,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, from_sample);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
@@ -7271,7 +7300,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, from_sample);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -7292,9 +7321,14 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ bool from_sample)
{
u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
@@ -7312,9 +7346,9 @@ static void perf_output_read(struct perf_output_handle *handle,
calc_timer_values(event, &now, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
- perf_output_read_group(handle, event, enabled, running);
+ perf_output_read_group(handle, event, enabled, running, from_sample);
else
- perf_output_read_one(handle, event, enabled, running);
+ perf_output_read_one(handle, event, enabled, running, from_sample);
}
void perf_output_sample(struct perf_output_handle *handle,
@@ -7354,7 +7388,7 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, data->period);
if (sample_type & PERF_SAMPLE_READ)
- perf_output_read(handle, event);
+ perf_output_read(handle, event, true);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
@@ -7955,7 +7989,7 @@ perf_event_read_event(struct perf_event *event,
return;
perf_output_put(&handle, read_event);
- perf_output_read(&handle, event);
+ perf_output_read(&handle, event, false);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
@@ -12021,10 +12055,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
@@ -13048,7 +13084,7 @@ static void sync_child_event(struct perf_event *child_event)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count_cumulative(child_event);
/*
* Add back the child's count to the parent's count:
--
2.45.2
* Re: [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-06 14:40 ` [PATCH v7 1/4] " Ben Gainey
@ 2024-06-07 9:32 ` Peter Zijlstra
2024-06-07 10:16 ` Ben Gainey
0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2024-06-07 9:32 UTC (permalink / raw)
To: Ben Gainey
Cc: mingo, acme, namhyung, james.clark, mark.rutland,
alexander.shishkin, jolsa, irogers, adrian.hunter,
linux-perf-users, linux-kernel
On Thu, Jun 06, 2024 at 03:40:56PM +0100, Ben Gainey wrote:
> This change allows events to use PERF_SAMPLE_READ with inherit
> so long as PERF_SAMPLE_TID is also set.
>
> In this configuration, an event will be inherited into any
> child processes / threads, allowing convenient profiling of a
> multiprocess or multithreaded application, whilst allowing
> profiling tools to collect per-thread samples, in particular
> of groups of counters.
Perhaps a few words on *WHY* this is important.
> The read_format field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE
> is changed by this new configuration, but calls to `read()` on the same
> event file descriptor are unaffected and continue to return the
> cumulative total.
This is unfortunate. Up to this point they were the same. Also, I see no
change to the uapi file. So were you trying to say that only
read_format::value is changed to be the thread local value as opposed to
the hierarchy total?
Please fix the wording to be unambiguous as to what is actually meant.
Also try and justify why it is okay to break this symmetry.
> @@ -3532,11 +3544,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> perf_ctx_disable(ctx, false);
>
> /* PMIs are disabled; ctx->nr_pending is stable. */
> - if (local_read(&ctx->nr_pending) ||
> + if (ctx->nr_inherit_read ||
> + next_ctx->nr_inherit_read ||
> + local_read(&ctx->nr_pending) ||
> local_read(&next_ctx->nr_pending)) {
This seems unfortunate, nr_pending and nr_inherit_read are both used
exclusively to inhibit this context switch optimization. Surely they can
share the exact same counter.
That is, rename nr_pending and use it for both?
> /*
> * Must not swap out ctx when there's pending
> * events that rely on the ctx->task relation.
> + *
> + * Likewise, when a context contains inherit +
> + * SAMPLE_READ events they should be switched
> + * out using the slow path so that they are
> + * treated as if they were distinct contexts.
> */
> raw_spin_unlock(&next_ctx->lock);
> rcu_read_unlock();
> @@ -4552,11 +4571,19 @@ static void __perf_event_read(void *info)
> raw_spin_unlock(&ctx->lock);
> }
>
> -static inline u64 perf_event_count(struct perf_event *event)
> +static inline u64 perf_event_count_cumulative(struct perf_event *event)
I don't think you need this -- overly long and hard to type function
name...
> {
> return local64_read(&event->count) + atomic64_read(&event->child_count);
> }
>
> +static inline u64 perf_event_count(struct perf_event *event, bool self_value_only)
> +{
> + if (self_value_only && has_inherit_and_sample_read(&event->attr))
> + return local64_read(&event->count);
... if this @self_value_only argument was actually used as such -- it
isn't, see how you use 'from_sample' which is something else entirely.
Which then also caused you to fix it up and make a mess with that &&
has_inherit_and_sample_read() nonsense. (also, shorter function names,
more good)
> +
> + return perf_event_count_cumulative(event);
> +}
That is, I would really rather you had:
static inline u64 perf_event_count(struct perf_event *event, bool self)
{
if (self)
return local64_read(&event->count);
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
And then actually use that argument as intended.
> @@ -7205,13 +7232,14 @@ void perf_event__output_id_sample(struct perf_event *event,
>
> static void perf_output_read_one(struct perf_output_handle *handle,
> struct perf_event *event,
> - u64 enabled, u64 running)
> + u64 enabled, u64 running,
> + bool from_sample)
> {
> u64 read_format = event->attr.read_format;
> u64 values[5];
> int n = 0;
>
> - values[n++] = perf_event_count(event);
> + values[n++] = perf_event_count(event, from_sample);
...observe the fail... from_sample != self-value-only
> if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> values[n++] = enabled +
> atomic64_read(&event->child_total_time_enabled);
* Re: [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-07 9:32 ` Peter Zijlstra
@ 2024-06-07 10:16 ` Ben Gainey
2024-06-07 11:02 ` Peter Zijlstra
0 siblings, 1 reply; 10+ messages in thread
From: Ben Gainey @ 2024-06-07 10:16 UTC (permalink / raw)
To: peterz@infradead.org
Cc: alexander.shishkin@linux.intel.com, linux-kernel@vger.kernel.org,
Mark Rutland, acme@kernel.org, mingo@redhat.com, James Clark,
adrian.hunter@intel.com, namhyung@kernel.org, irogers@google.com,
jolsa@kernel.org, linux-perf-users@vger.kernel.org
On Fri, 2024-06-07 at 11:32 +0200, Peter Zijlstra wrote:
> On Thu, Jun 06, 2024 at 03:40:56PM +0100, Ben Gainey wrote:
> > This change allows events to use PERF_SAMPLE_READ with inherit
> > so long as PERF_SAMPLE_TID is also set.
> >
> > In this configuration, an event will be inherited into any
> > child processes / threads, allowing convenient profiling of a
> > multiprocess or multithreaded application, whilst allowing
> > profiling tools to collect per-thread samples, in particular
> > of groups of counters.
>
> Perhaps a few words on *WHY* this is important.
>
> > The read_format field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE
> > is changed by this new configuration, but calls to `read()` on the
> > same event file descriptor are unaffected and continue to return the
> > cumulative total.
>
> This is unfortunate. Up to this point they were the same. Also, I see
> no change to the uapi file. So were you trying to say that only
> read_format::value is changed to be the thread local value as opposed
> to the hierarchy total?
>
> Please fix the wording to be unambiguous as to what is actually meant.
> Also try and justify why it is okay to break this symmetry.
Yes, the meaning of the PERF_RECORD_SAMPLE's read_format value changes
in this specific scenario to be a thread-local value rather than the
total.
I'll update and add some justification.
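(A concrete illustration with made-up numbers: if two inherited threads
have counted 70 and 30 events respectively, read() on the original fd
still returns 100, while a PERF_RECORD_SAMPLE taken in the second thread
carries a read_format value of 30, which is what lets the tool compute
per-thread period deltas.)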
>
> > @@ -3532,11 +3544,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> > perf_ctx_disable(ctx, false);
> >
> > /* PMIs are disabled; ctx->nr_pending is stable. */
> > - if (local_read(&ctx->nr_pending) ||
> > + if (ctx->nr_inherit_read ||
> > + next_ctx->nr_inherit_read ||
> > + local_read(&ctx->nr_pending) ||
> > local_read(&next_ctx->nr_pending)) {
>
> This seems unfortunate, nr_pending and nr_inherit_read are both used
> exclusively to inhibit this context switch optimization. Surely they
> can share the exact same counter.
>
> That is, rename nr_pending and use it for both?
Sure, how about "nr_no_switch_fast"?
>
> > /*
> > * Must not swap out ctx when there's pending
> > * events that rely on the ctx->task relation.
> > + *
> > + * Likewise, when a context contains inherit +
> > + * SAMPLE_READ events they should be switched
> > + * out using the slow path so that they are
> > + * treated as if they were distinct contexts.
> > */
> > raw_spin_unlock(&next_ctx->lock);
> > rcu_read_unlock();
> > @@ -4552,11 +4571,19 @@ static void __perf_event_read(void *info)
> > raw_spin_unlock(&ctx->lock);
> > }
> >
> > -static inline u64 perf_event_count(struct perf_event *event)
> > +static inline u64 perf_event_count_cumulative(struct perf_event *event)
>
> I don't think you need this -- overly long and hard to type function
> name...
Sure, presumably you are happy with just calling
"perf_event_count(event, false)" everywhere it is currently used,
rather than renaming it to something shorter and keeping the two
functions?
>
> > {
> > return local64_read(&event->count) + atomic64_read(&event->child_count);
> > }
> >
> > +static inline u64 perf_event_count(struct perf_event *event, bool self_value_only)
> > +{
> > + if (self_value_only && has_inherit_and_sample_read(&event->attr))
> > + return local64_read(&event->count);
>
> ... if this @self_value_only argument was actually used as such -- it
> isn't, see how you use 'from_sample' which is something else entirely.
> Which then also caused you to fix it up and make a mess with that &&
> has_inherit_and_sample_read() nonsense. (also, shorter function names,
> more good)
>
> > +
> > + return perf_event_count_cumulative(event);
> > +}
>
> That is, I would really rather you had:
>
> static inline u64 perf_event_count(struct perf_event *event, bool self)
> {
> if (self)
> return local64_read(&event->count);
>
> return local64_read(&event->count) + atomic64_read(&event->child_count);
> }
>
> And then actually use that argument as intended.
Fair point.
I was trying to avoid the 3 subsequent uses all having to repeat
"from_sample && has_inherit_and_sample_read(&event->attr)", which feels
a bit pit-trappy.
I suppose I could pull that into a "use_self_value(from_sample, event)"?
>
> > @@ -7205,13 +7232,14 @@ void perf_event__output_id_sample(struct perf_event *event,
> >
> > static void perf_output_read_one(struct perf_output_handle *handle,
> > struct perf_event *event,
> > - u64 enabled, u64 running)
> > + u64 enabled, u64 running,
> > + bool from_sample)
> > {
> > u64 read_format = event->attr.read_format;
> > u64 values[5];
> > int n = 0;
> >
> > - values[n++] = perf_event_count(event);
> > + values[n++] = perf_event_count(event, from_sample);
>
> ...observe the fail... from_sample != self-value-only
By fail you are referring to the difference in names?
>
> > if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> > values[n++] = enabled +
> > atomic64_read(&event->child_total_time_enabled);
>
Thanks
Ben
* Re: [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-07 10:16 ` Ben Gainey
@ 2024-06-07 11:02 ` Peter Zijlstra
2024-06-07 14:04 ` Ben Gainey
0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2024-06-07 11:02 UTC (permalink / raw)
To: Ben Gainey
Cc: alexander.shishkin@linux.intel.com, linux-kernel@vger.kernel.org,
Mark Rutland, acme@kernel.org, mingo@redhat.com, James Clark,
adrian.hunter@intel.com, namhyung@kernel.org, irogers@google.com,
jolsa@kernel.org, linux-perf-users@vger.kernel.org
On Fri, Jun 07, 2024 at 10:16:15AM +0000, Ben Gainey wrote:
> > > @@ -3532,11 +3544,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> > > perf_ctx_disable(ctx, false);
> > >
> > > /* PMIs are disabled; ctx->nr_pending is stable. */
> > > - if (local_read(&ctx->nr_pending) ||
> > > + if (ctx->nr_inherit_read ||
> > > + next_ctx->nr_inherit_read ||
> > > + local_read(&ctx->nr_pending) ||
> > > local_read(&next_ctx->nr_pending)) {
> >
> > This seems unfortunate, nr_pending and nr_inherit_read are both used
> > exclusively to inhibit this context switch optimization. Surely they
> > can
> > share the exact same counter.
> >
> > That is, rename nr_pending and use it for both?
>
> Sure, how about "nr_no_switch_fast"?
Yeah, I suppose.
> Sure, presumably you are happy with just calling
> "perf_event_count(event, false)" everywhere it is currently used,
> rather than renaming it to something shorter and keeping the two
> functions?
Yeah, there aren't *that* many instances. Your current patch already
touches them all anyway.
> > That is, I would really rather you had:
> >
> > static inline u64 perf_event_count(struct perf_event *event, bool self)
> > {
> > if (self)
> > return local64_read(&event->count);
> >
> > return local64_read(&event->count) + atomic64_read(&event->child_count);
> > }
> >
> > And then actually use that argument as intended.
>
>
> Fair point.
>
> I was trying to avoid the 3 subsequent uses all having to repeat
> "from_sample && has_inherit_and_sample_read(&event->attr)", which feels
> a bit of a pit-trappy.
>
> I suppose I could pull that into a "use_self_value(from_sample,event)"?
IIRC they all originate in a single location around perf_output_read(),
that already has the event and could easily 'correct' the semantic
meaning by doing the above once or so.
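(Concretely, the suggestion amounts to computing, once in
perf_output_read(), something like:

	bool self = from_sample && has_inherit_and_sample_read(&event->attr);

and passing that down, so perf_event_count(event, self) stays honest --
a sketch of the idea, not committed code.)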
> >
> > > @@ -7205,13 +7232,14 @@ void perf_event__output_id_sample(struct perf_event *event,
> > >
> > > static void perf_output_read_one(struct perf_output_handle *handle,
> > > struct perf_event *event,
> > > - u64 enabled, u64 running)
> > > + u64 enabled, u64 running,
> > > + bool from_sample)
> > > {
> > > u64 read_format = event->attr.read_format;
> > > u64 values[5];
> > > int n = 0;
> > >
> > > - values[n++] = perf_event_count(event);
> > > + values[n++] = perf_event_count(event, from_sample);
> >
> > ...observe the fail... from_sample != self-value-only
>
> By fail you are referring to the difference in names?
The difference in meaning: one is from-sample, the other is self-value.
Per the extra condition squirrelled away they are not equivalent.
* Re: [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-07 11:02 ` Peter Zijlstra
@ 2024-06-07 14:04 ` Ben Gainey
2024-06-10 19:01 ` Peter Zijlstra
0 siblings, 1 reply; 10+ messages in thread
From: Ben Gainey @ 2024-06-07 14:04 UTC (permalink / raw)
To: peterz@infradead.org
Cc: alexander.shishkin@linux.intel.com,
linux-perf-users@vger.kernel.org, Mark Rutland, acme@kernel.org,
mingo@redhat.com, James Clark, adrian.hunter@intel.com,
namhyung@kernel.org, irogers@google.com, jolsa@kernel.org,
linux-kernel@vger.kernel.org
On Fri, 2024-06-07 at 13:02 +0200, Peter Zijlstra wrote:
> On Fri, Jun 07, 2024 at 10:16:15AM +0000, Ben Gainey wrote:
>
> > > > @@ -3532,11 +3544,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> > > > perf_ctx_disable(ctx, false);
> > > >
> > > > /* PMIs are disabled; ctx->nr_pending is stable. */
> > > > - if (local_read(&ctx->nr_pending) ||
> > > > + if (ctx->nr_inherit_read ||
> > > > + next_ctx->nr_inherit_read ||
> > > > + local_read(&ctx->nr_pending) ||
> > > > local_read(&next_ctx->nr_pending)) {
> > >
> > > This seems unfortunate, nr_pending and nr_inherit_read are both
> > > used
> > > exclusively to inhibit this context switch optimization. Surely
> > > they
> > > can
> > > share the exact same counter.
> > >
> > > That is, rename nr_pending and use it for both?
> >
> > Sure, how about "nr_no_switch_fast" ?
>
> Yeah, I suppose.
>
>
> > Sure, presumably you are happy with just calling
> > "perf_event_count(event, false)" everywhere it is currently used,
> > rather than renaming it to something shorter and keeping the two
> > functions?
>
> Yeah, there aren't *that* many instances. Your current patch already
> touches them all anyway.
>
> > > That is, I would really rather you had:
> > >
> > > static inline u64 perf_event_count(struct perf_event *event, bool self)
> > > {
> > > if (self)
> > > return local64_read(&event->count);
> > >
> > > return local64_read(&event->count) + atomic64_read(&event->child_count);
> > > }
> > >
> > > And then actually use that argument as intended.
> >
> >
> > Fair point.
> >
> > I was trying to avoid the 3 subsequent uses all having to repeat
> > "from_sample && has_inherit_and_sample_read(&event->attr)", which
> > feels
> > a bit of a pit-trappy.�
> >
> > I suppose I could pull that into a
> > "use_self_value(from_sample,event)"?
>
> IIRC they all originate in a single location around
> perf_output_read(),
> that already has the event and could easily 'correct' the semantic
> meaning by doing the above once or so.
>
As far as I can tell, you can mix events in a group with inconsistent
values of PERF_SAMPLE_READ, which means that doing it at the top level
introduces inconsistent/confusing behaviour, since it makes the
"thread-local-ness" of the read_format values a property of the event
that caused the sample, rather than of the specific event to which the
value belongs. The current implementation makes it a property of the
specific event, not the sample event. Specifically, when
perf_output_read_group reads a child event that does not have
PERF_SAMPLE_READ, the sample event must have PERF_SAMPLE_READ, meaning
the child event will give the thread-local value even though it was not
created as inherit+PERF_SAMPLE_READ.
I can either:
* Keep it so that the perf_output_read_group uses per-event value for
self
* Rework the deliver_sample_value in session.c to base its decision on
the sample event rather than the specific event
* Forbid inconsistent PERF_SAMPLE_READ for events in a group
* Something else?
> > >
> > > > @@ -7205,13 +7232,14 @@ void perf_event__output_id_sample(struct perf_event *event,
> > > >
> > > > static void perf_output_read_one(struct perf_output_handle *handle,
> > > > struct perf_event *event,
> > > > - u64 enabled, u64 running)
> > > > + u64 enabled, u64 running,
> > > > + bool from_sample)
> > > > {
> > > > u64 read_format = event->attr.read_format;
> > > > u64 values[5];
> > > > int n = 0;
> > > >
> > > > - values[n++] = perf_event_count(event);
> > > > + values[n++] = perf_event_count(event, from_sample);
> > >
> > > ...observe the fail... from_sample != self-value-only
> >
> > By fail you are referring to the difference in names?
>
> The difference in meaning, one is from-sample, the other is self-
> value.
> Per the extra condition squirrelled away they are not equivalent.
* Re: [PATCH v7 1/4] perf: Support PERF_SAMPLE_READ with inherit
2024-06-07 14:04 ` Ben Gainey
@ 2024-06-10 19:01 ` Peter Zijlstra
0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-06-10 19:01 UTC (permalink / raw)
To: Ben Gainey
Cc: alexander.shishkin@linux.intel.com,
linux-perf-users@vger.kernel.org, Mark Rutland, acme@kernel.org,
mingo@redhat.com, James Clark, adrian.hunter@intel.com,
namhyung@kernel.org, irogers@google.com, jolsa@kernel.org,
linux-kernel@vger.kernel.org
On Fri, Jun 07, 2024 at 02:04:49PM +0000, Ben Gainey wrote:
> > IIRC they all originate in a single location around
> > perf_output_read(),
> > that already has the event and could easily 'correct' the semantic
> > meaning by doing the above once or so.
> >
>
> As far as I can tell, you can mix events in a group with inconsistent
> values of PERF_SAMPLE_READ which means that doing it at the top level
> introduces an inconsistency/confusing behaviour since it makes the
> "thread-local-ness" of the read_format values a property of the event
> that caused the sample, rather than of the specific event to which the
> value belongs. The current implementation makes it a property of the
> specific event not the sample event. Specifically, when
> perf_output_read_group reads a child event that does not have
> PERF_SAMPLE_READ, the sample event must have PERF_SAMPLE_READ, meaning
> the child event will give the thread-local value even though it was not
> created as inherit+PERF_SAMPLE_READ
>
> I can either:
>
> * Keep it so that the perf_output_read_group uses per-event value for
> self
Having the lot be inconsistent seems bad.
> * Rework the deliver_sample_value in session.c to base its decision on
> the sample event rather than the specific event
The code as it exists seems to use the read_format from the event that
actually triggers the sample -- which makes sense. I would expect this
new thing to follow that.
Mixing inherit between events in a group gets you crap, but that's what
you asked for I suppose :-) But otherwise let the sampling event set the
format.
> * Forbid inconsistent PERF_SAMPLE_READ for events in a group
Not possible I think, you can have non-sampling events in a group.
* [PATCH v7 2/4] tools/perf: Track where perf_sample_ids need per-thread periods
2024-06-06 14:40 [PATCH v7 0/4] perf: Support PERF_SAMPLE_READ with inherit Ben Gainey
2024-06-06 14:40 ` [PATCH v7 1/4] " Ben Gainey
@ 2024-06-06 14:40 ` Ben Gainey
2024-06-06 14:40 ` [PATCH v7 3/4] tools/perf: Correctly calculate sample period for inherited SAMPLE_READ values Ben Gainey
2024-06-06 14:40 ` [PATCH v7 4/4] tools/perf: Allow inherit + PERF_SAMPLE_READ when opening events Ben Gainey
3 siblings, 0 replies; 10+ messages in thread
From: Ben Gainey @ 2024-06-06 14:40 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: james.clark, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel, Ben Gainey
perf_sample_ids and related evlist/evsel code are modified to track
which events combine inherit+PERF_SAMPLE_READ+PERF_SAMPLE_TID.
Events with this combination of properties must be handled differently
when calculating each sample period. They must use the combination
of (ID + TID) to uniquely identify each distinct sequence of values.
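As a worked example (illustrative numbers): an inherited event with
ID 7 sampled in thread 100 might produce the totals 50, 80 while thread
200 produces 40, 90. Keyed on ID alone, the interleaved stream
50, 80, 40, 90 yields a bogus (underflowing) delta at the 80 -> 40 step;
keyed on (ID, TID), the two sequences stay separate and the deltas are
30 and 50.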
Signed-off-by: Ben Gainey <ben.gainey@arm.com>
---
tools/lib/perf/evlist.c | 1 +
tools/lib/perf/evsel.c | 7 +++++++
tools/lib/perf/include/internal/evsel.h | 9 +++++++++
3 files changed, 17 insertions(+)
diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index c6d67fc9e57e..d17288eeaee4 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -255,6 +255,7 @@ static void perf_evlist__id_hash(struct perf_evlist *evlist,
sid->id = id;
sid->evsel = evsel;
+ sid->period_per_thread = perf_evsel__attr_has_per_thread_sample_period(evsel);
hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
hlist_add_head(&sid->node, &evlist->heads[hash]);
}
diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index c07160953224..f7abb879f416 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -537,6 +537,13 @@ void perf_evsel__free_id(struct perf_evsel *evsel)
evsel->ids = 0;
}
+bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evsel)
+{
+ return (evsel->attr.sample_type & PERF_SAMPLE_READ)
+ && (evsel->attr.sample_type & PERF_SAMPLE_TID)
+ && evsel->attr.inherit;
+}
+
void perf_counts_values__scale(struct perf_counts_values *count,
bool scale, __s8 *pscaled)
{
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index 5cd220a61962..f8de2bf89c76 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -36,6 +36,13 @@ struct perf_sample_id {
/* Holds total ID period value for PERF_SAMPLE_READ processing. */
u64 period;
+
+ /*
+ * When inherit is combined with PERF_SAMPLE_READ, the period value is
+ * per (id, thread) tuple, rather than per id, so use the tid in
+ * combination with the id to uniquely identify the period.
+ */
+ bool period_per_thread;
};
struct perf_evsel {
@@ -88,4 +95,6 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter);
int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
void perf_evsel__free_id(struct perf_evsel *evsel);
+bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evsel);
+
#endif /* __LIBPERF_INTERNAL_EVSEL_H */
--
2.45.2
* [PATCH v7 3/4] tools/perf: Correctly calculate sample period for inherited SAMPLE_READ values
2024-06-06 14:40 [PATCH v7 0/4] perf: Support PERF_SAMPLE_READ with inherit Ben Gainey
2024-06-06 14:40 ` [PATCH v7 1/4] " Ben Gainey
2024-06-06 14:40 ` [PATCH v7 2/4] tools/perf: Track where perf_sample_ids need per-thread periods Ben Gainey
@ 2024-06-06 14:40 ` Ben Gainey
2024-06-06 14:40 ` [PATCH v7 4/4] tools/perf: Allow inherit + PERF_SAMPLE_READ when opening events Ben Gainey
3 siblings, 0 replies; 10+ messages in thread
From: Ben Gainey @ 2024-06-06 14:40 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: james.clark, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel, Ben Gainey
Sample period calculation is updated to take into account the fact that
events with inherit+PERF_SAMPLE_READ+PERF_SAMPLE_TID must use the TID in
combination with the ID field in the read_format data to identify which
value represents the previous accumulated counter total, from which the
period delta since the last sample is calculated.
perf_sample_id is modified to support tracking per-thread
values, along with the existing global per-id values. In the
per-thread case, values are stored in a hash by TID within the
perf_sample_id, and are dynamically allocated, as the number of threads
is not known ahead of time.
deliver_sample_value is modified to correctly locate the previous
sample storage based on the attribute, stream id and thread id.
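For reference, the payload being walked here is the PERF_FORMAT_GROUP
read block carried in each sample; a rough sketch of its shape (per the
perf_event_open(2) read_format description, LOST entries elided):

/*
 * struct read_format {
 *         u64 nr;                 // number of counters in the group
 *         { u64 time_enabled; }   // if PERF_FORMAT_TOTAL_TIME_ENABLED
 *         { u64 time_running; }   // if PERF_FORMAT_TOTAL_TIME_RUNNING
 *         struct {
 *                 u64 value;      // accumulated counter total
 *                 { u64 id; }     // if PERF_FORMAT_ID
 *         } values[nr];
 * };
 *
 * With inherit + PERF_SAMPLE_READ, each 'value' is the sampled thread's
 * own running total, so the tool pairs values[i].id with sample->tid to
 * look up the previous total and compute the period delta.
 */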
Signed-off-by: Ben Gainey <ben.gainey@arm.com>
---
tools/lib/perf/evsel.c | 41 ++++++++++++++++++++++
tools/lib/perf/include/internal/evsel.h | 45 +++++++++++++++++++++++--
tools/perf/util/session.c | 11 ++++--
3 files changed, 92 insertions(+), 5 deletions(-)
diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index f7abb879f416..26f3d7ba0f26 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -5,6 +5,7 @@
#include <perf/evsel.h>
#include <perf/cpumap.h>
#include <perf/threadmap.h>
+#include <linux/hash.h>
#include <linux/list.h>
#include <internal/evsel.h>
#include <linux/zalloc.h>
@@ -23,6 +24,7 @@ void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr,
int idx)
{
INIT_LIST_HEAD(&evsel->node);
+ INIT_LIST_HEAD(&evsel->per_stream_periods);
evsel->attr = *attr;
evsel->idx = idx;
evsel->leader = evsel;
@@ -531,10 +533,17 @@ int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
void perf_evsel__free_id(struct perf_evsel *evsel)
{
+ struct perf_sample_id_period *pos, *n;
+
xyarray__delete(evsel->sample_id);
evsel->sample_id = NULL;
zfree(&evsel->id);
evsel->ids = 0;
+
+ perf_evsel_for_each_per_thread_period_safe(evsel, n, pos) {
+ list_del_init(&pos->node);
+ free(pos);
+ }
}
bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evsel)
@@ -544,6 +553,38 @@ bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evsel)
&& evsel->attr.inherit;
}
+u64 *perf_sample_id__get_period_storage(struct perf_sample_id *sid, u32 tid)
+{
+ struct hlist_head *head;
+ struct perf_sample_id_period *res;
+ int hash;
+
+ if (!sid->period_per_thread)
+ return &sid->period;
+
+ hash = hash_32(tid, PERF_SAMPLE_ID__HLIST_BITS);
+ head = &sid->periods[hash];
+
+ hlist_for_each_entry(res, head, hnode)
+ if (res->tid == tid)
+ return &res->period;
+
+ if (sid->evsel == NULL)
+ return NULL;
+
+ res = zalloc(sizeof(struct perf_sample_id_period));
+ if (res == NULL)
+ return NULL;
+
+ INIT_LIST_HEAD(&res->node);
+ res->tid = tid;
+
+ list_add_tail(&res->node, &sid->evsel->per_stream_periods);
+ hlist_add_head(&res->hnode, &sid->periods[hash]);
+
+ return &res->period;
+}
+
void perf_counts_values__scale(struct perf_counts_values *count,
bool scale, __s8 *pscaled)
{
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index f8de2bf89c76..797dc9d78254 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -11,6 +11,32 @@
struct perf_thread_map;
struct xyarray;
+/**
+ * The per-thread accumulated period storage node.
+ */
+struct perf_sample_id_period {
+ struct list_head node;
+ struct hlist_node hnode;
+ /* Holds total ID period value for PERF_SAMPLE_READ processing. */
+ u64 period;
+ /* The TID that the value belongs to */
+ u32 tid;
+};
+
+/**
+ * perf_evsel_for_each_per_thread_period_safe - safely iterate through all the
+ * per_stream_periods
+ * @evsel: perf_evsel instance to iterate
+ * @tmp: struct perf_sample_id_period temp iterator
+ * @item: struct perf_sample_id_period iterator
+ */
+#define perf_evsel_for_each_per_thread_period_safe(evsel, tmp, item) \
+ list_for_each_entry_safe(item, tmp, &(evsel)->per_stream_periods, node)
+
+
+#define PERF_SAMPLE_ID__HLIST_BITS 4
+#define PERF_SAMPLE_ID__HLIST_SIZE (1 << PERF_SAMPLE_ID__HLIST_BITS)
+
/*
* Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
* more than one entry in the evlist.
@@ -34,8 +60,18 @@ struct perf_sample_id {
pid_t machine_pid;
struct perf_cpu vcpu;
- /* Holds total ID period value for PERF_SAMPLE_READ processing. */
- u64 period;
+ union {
+ /*
+ * Holds total ID period value for PERF_SAMPLE_READ processing
+ * (when period is not per-thread).
+ */
+ u64 period;
+ /*
+ * Holds total ID period value for PERF_SAMPLE_READ processing
+ * (when period is per-thread).
+ */
+ struct hlist_head periods[PERF_SAMPLE_ID__HLIST_SIZE];
+ };
/*
* When inherit is combined with PERF_SAMPLE_READ, the period value is
@@ -65,6 +101,9 @@ struct perf_evsel {
u32 ids;
struct perf_evsel *leader;
+ /* Where period_per_thread is true, stores the per-thread values */
+ struct list_head per_stream_periods;
+
/* parse modifier helper */
int nr_members;
/*
@@ -97,4 +136,6 @@ void perf_evsel__free_id(struct perf_evsel *evsel);
bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evsel);
+u64 *perf_sample_id__get_period_storage(struct perf_sample_id *sid, u32 tid);
+
#endif /* __LIBPERF_INTERNAL_EVSEL_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index a10343b9dcd4..cf5dbe075674 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1478,14 +1478,19 @@ static int deliver_sample_value(struct evlist *evlist,
{
struct perf_sample_id *sid = evlist__id2sid(evlist, v->id);
struct evsel *evsel;
+ u64 *storage = NULL;
if (sid) {
+ storage = perf_sample_id__get_period_storage(sid, sample->tid);
+ }
+
+ if (storage) {
sample->id = v->id;
- sample->period = v->value - sid->period;
- sid->period = v->value;
+ sample->period = v->value - *storage;
+ *storage = v->value;
}
- if (!sid || sid->evsel == NULL) {
+ if (!storage || sid->evsel == NULL) {
++evlist->stats.nr_unknown_id;
return 0;
}
--
2.45.2
* [PATCH v7 4/4] tools/perf: Allow inherit + PERF_SAMPLE_READ when opening events
2024-06-06 14:40 [PATCH v7 0/4] perf: Support PERF_SAMPLE_READ with inherit Ben Gainey
` (2 preceding siblings ...)
2024-06-06 14:40 ` [PATCH v7 3/4] tools/perf: Correctly calculate sample period for inherited SAMPLE_READ values Ben Gainey
@ 2024-06-06 14:40 ` Ben Gainey
3 siblings, 0 replies; 10+ messages in thread
From: Ben Gainey @ 2024-06-06 14:40 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: james.clark, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel, Ben Gainey
The tool will now default to this new mode if the user specifies a
sampling group when not in system-wide mode, and when --no-inherit
is not specified.
This change updates evsel to allow the combination of inherit
and PERF_SAMPLE_READ.
A fallback is implemented for kernel versions where this feature is not
supported.
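Condensed, the probe in evsel__detect_missing_features() amounts to the
following (a sketch of the flow, not the literal tool code):

/*
 * Probe the newest attribute feature first: if perf_event_open() failed
 * and inherit + PERF_SAMPLE_READ was requested, assume the kernel is too
 * old, record that, and let the caller clear inherit and retry.
 */
if (!perf_missing_features.inherit_sample_read &&
    evsel->core.attr.inherit &&
    (evsel->core.attr.sample_type & PERF_SAMPLE_READ)) {
	perf_missing_features.inherit_sample_read = true;
	return true;	/* retry the open with inherit cleared */
}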
Signed-off-by: Ben Gainey <ben.gainey@arm.com>
---
tools/perf/tests/attr/README | 2 +
.../tests/attr/test-record-group-sampling | 3 +-
.../tests/attr/test-record-group-sampling1 | 51 ++++++++++++++++
.../tests/attr/test-record-group-sampling2 | 61 +++++++++++++++++++
tools/perf/tests/attr/test-record-group2 | 1 +
...{test-record-group2 => test-record-group3} | 10 +--
tools/perf/util/evsel.c | 19 +++++-
tools/perf/util/evsel.h | 1 +
8 files changed, 141 insertions(+), 7 deletions(-)
create mode 100644 tools/perf/tests/attr/test-record-group-sampling1
create mode 100644 tools/perf/tests/attr/test-record-group-sampling2
copy tools/perf/tests/attr/{test-record-group2 => test-record-group3} (81%)
diff --git a/tools/perf/tests/attr/README b/tools/perf/tests/attr/README
index 4066fec7180a..67c4ca76b85d 100644
--- a/tools/perf/tests/attr/README
+++ b/tools/perf/tests/attr/README
@@ -51,6 +51,8 @@ Following tests are defined (with perf commands):
perf record --call-graph fp kill (test-record-graph-fp-aarch64)
perf record -e '{cycles,instructions}' kill (test-record-group1)
perf record -e '{cycles/period=1/,instructions/period=2/}:S' kill (test-record-group2)
+ perf record -e '{cycles,cache-misses}:S' kill (test-record-group-sampling1)
+ perf record -c 10000 -e '{cycles,cache-misses}:S' kill (test-record-group-sampling2)
perf record -D kill (test-record-no-delay)
perf record -i kill (test-record-no-inherit)
perf record -n kill (test-record-no-samples)
diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling
index 97e7e64a38f0..c32ceb156226 100644
--- a/tools/perf/tests/attr/test-record-group-sampling
+++ b/tools/perf/tests/attr/test-record-group-sampling
@@ -2,6 +2,7 @@
command = record
args = --no-bpf-event -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1
ret = 1
+kernel_until = 6.10
[event-1:base-record]
fd=1
@@ -18,7 +19,7 @@ group_fd=1
type=0
config=3
-# default | PERF_SAMPLE_READ
+# default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD
sample_type=343
# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST
diff --git a/tools/perf/tests/attr/test-record-group-sampling1 b/tools/perf/tests/attr/test-record-group-sampling1
new file mode 100644
index 000000000000..db6404c1ef78
--- /dev/null
+++ b/tools/perf/tests/attr/test-record-group-sampling1
@@ -0,0 +1,51 @@
+[config]
+command = record
+args = --no-bpf-event -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1
+ret = 1
+kernel_since = 6.10
+
+[event-1:base-record]
+fd=1
+group_fd=-1
+
+# cycles
+type=0
+config=0
+
+# default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD
+sample_type=343
+
+# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
+read_format=28|31
+task=1
+mmap=1
+comm=1
+enable_on_exec=1
+disabled=1
+
+# inherit is enabled for group sampling
+inherit=1
+
+[event-2:base-record]
+fd=2
+group_fd=1
+
+# cache-misses
+type=0
+config=3
+
+# default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD
+sample_type=343
+
+# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
+read_format=28|31
+task=0
+mmap=0
+comm=0
+enable_on_exec=0
+disabled=0
+freq=0
+
+# inherit is enabled for group sampling
+inherit=1
+
diff --git a/tools/perf/tests/attr/test-record-group-sampling2 b/tools/perf/tests/attr/test-record-group-sampling2
new file mode 100644
index 000000000000..32884df78c95
--- /dev/null
+++ b/tools/perf/tests/attr/test-record-group-sampling2
@@ -0,0 +1,61 @@
+[config]
+command = record
+args = --no-bpf-event -c 10000 -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1
+ret = 1
+kernel_since = 6.10
+
+[event-1:base-record]
+fd=1
+group_fd=-1
+
+# cycles
+type=0
+config=0
+
+# default | PERF_SAMPLE_READ
+sample_type=87
+
+# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
+read_format=28|31
+task=1
+mmap=1
+comm=1
+enable_on_exec=1
+disabled=1
+
+# inherit is enabled for group sampling
+inherit=1
+
+# sampling disabled
+sample_freq=0
+sample_period=10000
+freq=0
+write_backward=0
+
+[event-2:base-record]
+fd=2
+group_fd=1
+
+# cache-misses
+type=0
+config=3
+
+# default | PERF_SAMPLE_READ
+sample_type=87
+
+# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
+read_format=28|31
+task=0
+mmap=0
+comm=0
+enable_on_exec=0
+disabled=0
+
+# inherit is enabled for group sampling
+inherit=1
+
+# sampling disabled
+sample_freq=0
+sample_period=0
+freq=0
+write_backward=0
diff --git a/tools/perf/tests/attr/test-record-group2 b/tools/perf/tests/attr/test-record-group2
index cebdaa8e64e4..8fe6c679618c 100644
--- a/tools/perf/tests/attr/test-record-group2
+++ b/tools/perf/tests/attr/test-record-group2
@@ -2,6 +2,7 @@
command = record
args = --no-bpf-event -e '{cycles/period=1234000/,instructions/period=6789000/}:S' kill >/dev/null 2>&1
ret = 1
+kernel_until = 6.10
[event-1:base-record]
fd=1
diff --git a/tools/perf/tests/attr/test-record-group2 b/tools/perf/tests/attr/test-record-group3
similarity index 81%
copy from tools/perf/tests/attr/test-record-group2
copy to tools/perf/tests/attr/test-record-group3
index cebdaa8e64e4..72e6a9ee2f60 100644
--- a/tools/perf/tests/attr/test-record-group2
+++ b/tools/perf/tests/attr/test-record-group3
@@ -2,6 +2,7 @@
command = record
args = --no-bpf-event -e '{cycles/period=1234000/,instructions/period=6789000/}:S' kill >/dev/null 2>&1
ret = 1
+kernel_since = 6.10
[event-1:base-record]
fd=1
@@ -9,8 +10,9 @@ group_fd=-1
config=0|1
sample_period=1234000
sample_type=87
-read_format=12|28
-inherit=0
+read_format=28|31
+disabled=1
+inherit=1
freq=0
[event-2:base-record]
@@ -19,9 +21,9 @@ group_fd=1
config=0|1
sample_period=6789000
sample_type=87
-read_format=12|28
+read_format=28|31
disabled=0
-inherit=0
+inherit=1
mmap=0
comm=0
freq=0
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4f818ab6b662..66b782811997 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1162,7 +1162,15 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
*/
if (leader->core.nr_members > 1) {
attr->read_format |= PERF_FORMAT_GROUP;
- attr->inherit = 0;
+ }
+
+ /*
+ * Inherit + SAMPLE_READ requires SAMPLE_TID in the sample_type,
+ * and ID in the read_format.
+ */
+ if (attr->inherit) {
+ evsel__set_sample_bit(evsel, TID);
+ evsel->core.attr.read_format |= PERF_FORMAT_ID;
}
}
@@ -1838,6 +1846,8 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
static void evsel__disable_missing_features(struct evsel *evsel)
{
+ if (perf_missing_features.inherit_sample_read)
+ evsel->core.attr.inherit = 0;
if (perf_missing_features.branch_counters)
evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_COUNTERS;
if (perf_missing_features.read_lost)
@@ -1893,7 +1903,12 @@ bool evsel__detect_missing_features(struct evsel *evsel)
* Must probe features in the order they were added to the
* perf_event_attr interface.
*/
- if (!perf_missing_features.branch_counters &&
+ if (!perf_missing_features.inherit_sample_read &&
+ evsel->core.attr.inherit && (evsel->core.attr.sample_type & PERF_SAMPLE_READ)) {
+ perf_missing_features.inherit_sample_read = true;
+ pr_debug2("Using PERF_SAMPLE_READ / :S modifier is not compatible with inherit, falling back to no-inherit.\n");
+ return true;
+ } else if (!perf_missing_features.branch_counters &&
(evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) {
perf_missing_features.branch_counters = true;
pr_debug2("switching off branch counters support\n");
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 375a38e15cd9..911c2fd42c6d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -192,6 +192,7 @@ struct perf_missing_features {
bool weight_struct;
bool read_lost;
bool branch_counters;
+ bool inherit_sample_read;
};
extern struct perf_missing_features perf_missing_features;
--
2.45.2