* [PATCH v2] perf/core: Save raw sample data conditionally based on sample type
@ 2024-05-02  0:06 Yabin Cui
From: Yabin Cui @ 2024-05-02  0:06 UTC
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter
  Cc: linux-perf-users, linux-kernel, bpf, Yabin Cui

Currently, space for raw sample data is always allocated within sample
records for both BPF output and tracepoint events. This leads to unused
space in sample records when raw sample data is not requested.

This patch checks the sample type of an event before saving raw sample
data in both the BPF output and tracepoint event handling logic. Raw
sample data is saved only when explicitly requested, reducing overhead
when it is not needed.

Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper")
Signed-off-by: Yabin Cui <yabinc@google.com>
---

Changes since v1:
 - Check event->attr.sample_type & PERF_SAMPLE_RAW before
   calling perf_sample_save_raw_data().
 - Subject has been changed to reflect the change of solution.

Original commit message from v1:
perf/core: Trim dyn_size if raw data is absent
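
For context, "not requested" means the event was opened without
PERF_SAMPLE_RAW in attr.sample_type. An illustrative sketch, not part
of this patch, using the uapi definitions from <linux/perf_event.h>
(tracepoint_id is a placeholder for an id read from tracefs):

	struct perf_event_attr attr = {
		.size		= sizeof(struct perf_event_attr),
		.type		= PERF_TYPE_TRACEPOINT,
		.config		= tracepoint_id,
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TIME,
		.sample_period	= 1,
	};
	/* No PERF_SAMPLE_RAW here, yet before this patch each sample
	 * for such an event still reserved space for the raw record. */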

 kernel/events/core.c     | 37 ++++++++++++++++++++-----------------
 kernel/trace/bpf_trace.c | 12 +++++++-----
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..dc5f3147feef 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10120,9 +10120,9 @@ static struct pmu perf_tracepoint = {
 };
 
 static int perf_tp_filter_match(struct perf_event *event,
-				struct perf_sample_data *data)
+				struct perf_raw_record *raw)
 {
-	void *record = data->raw->frag.data;
+	void *record = raw->frag.data;
 
 	/* only top level events have filters set */
 	if (event->parent)
@@ -10134,7 +10134,7 @@ static int perf_tp_filter_match(struct perf_event *event,
 }
 
 static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data,
+				struct perf_raw_record *raw,
 				struct pt_regs *regs)
 {
 	if (event->hw.state & PERF_HES_STOPPED)
@@ -10145,7 +10145,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	if (event->attr.exclude_kernel && !user_mode(regs))
 		return 0;
 
-	if (!perf_tp_filter_match(event, data))
+	if (!perf_tp_filter_match(event, raw))
 		return 0;
 
 	return 1;
@@ -10171,6 +10171,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 static void __perf_tp_event_target_task(u64 count, void *record,
 					struct pt_regs *regs,
 					struct perf_sample_data *data,
+					struct perf_raw_record *raw,
 					struct perf_event *event)
 {
 	struct trace_entry *entry = record;
@@ -10180,13 +10181,18 @@ static void __perf_tp_event_target_task(u64 count, void *record,
 	/* Cannot deliver synchronous signal to other task. */
 	if (event->attr.sigtrap)
 		return;
-	if (perf_tp_event_match(event, data, regs))
+	if (perf_tp_event_match(event, raw, regs)) {
+		perf_sample_data_init(data, 0, 0);
+		if (event->attr.sample_type & PERF_SAMPLE_RAW)
+			perf_sample_save_raw_data(data, raw);
 		perf_swevent_event(event, count, data, regs);
+	}
 }
 
 static void perf_tp_event_target_task(u64 count, void *record,
 				      struct pt_regs *regs,
 				      struct perf_sample_data *data,
+				      struct perf_raw_record *raw,
 				      struct perf_event_context *ctx)
 {
 	unsigned int cpu = smp_processor_id();
@@ -10194,15 +10200,15 @@ static void perf_tp_event_target_task(u64 count, void *record,
 	struct perf_event *event, *sibling;
 
 	perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
-		__perf_tp_event_target_task(count, record, regs, data, event);
+		__perf_tp_event_target_task(count, record, regs, data, raw, event);
 		for_each_sibling_event(sibling, event)
-			__perf_tp_event_target_task(count, record, regs, data, sibling);
+			__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
 	}
 
 	perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
-		__perf_tp_event_target_task(count, record, regs, data, event);
+		__perf_tp_event_target_task(count, record, regs, data, raw, event);
 		for_each_sibling_event(sibling, event)
-			__perf_tp_event_target_task(count, record, regs, data, sibling);
+			__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
 	}
 }
 
@@ -10220,15 +10226,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		},
 	};
 
-	perf_sample_data_init(&data, 0, 0);
-	perf_sample_save_raw_data(&data, &raw);
-
 	perf_trace_buf_update(record, event_type);
 
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
-		if (perf_tp_event_match(event, &data, regs)) {
-			perf_swevent_event(event, count, &data, regs);
-
+		if (perf_tp_event_match(event, &raw, regs)) {
 			/*
 			 * Here use the same on-stack perf_sample_data,
 			 * some members in data are event-specific and
@@ -10238,7 +10239,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 			 * because data->sample_flags is set.
 			 */
 			perf_sample_data_init(&data, 0, 0);
-			perf_sample_save_raw_data(&data, &raw);
+			if (event->attr.sample_type & PERF_SAMPLE_RAW)
+				perf_sample_save_raw_data(&data, &raw);
+			perf_swevent_event(event, count, &data, regs);
 		}
 	}
 
@@ -10255,7 +10258,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 			goto unlock;
 
 		raw_spin_lock(&ctx->lock);
-		perf_tp_event_target_task(count, record, regs, &data, ctx);
+		perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
 		raw_spin_unlock(&ctx->lock);
 unlock:
 		rcu_read_unlock();
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9dc605f08a23..4b3ff71b4c0a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -620,7 +620,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
-			u64 flags, struct perf_sample_data *sd)
+			u64 flags, struct perf_raw_record *raw,
+			struct perf_sample_data *sd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
@@ -645,6 +646,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
+	if (event->attr.sample_type & PERF_SAMPLE_RAW)
+		perf_sample_save_raw_data(sd, raw);
+
 	return perf_event_output(event, sd, regs);
 }
 
@@ -688,9 +692,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	}
 
 	perf_sample_data_init(sd, 0, 0);
-	perf_sample_save_raw_data(sd, &raw);
 
-	err = __bpf_perf_event_output(regs, map, flags, sd);
+	err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
 out:
 	this_cpu_dec(bpf_trace_nest_level);
 	preempt_enable();
@@ -749,9 +752,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 
 	perf_fetch_caller_regs(regs);
 	perf_sample_data_init(sd, 0, 0);
-	perf_sample_save_raw_data(sd, &raw);
 
-	ret = __bpf_perf_event_output(regs, map, flags, sd);
+	ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
 out:
 	this_cpu_dec(bpf_event_output_nest_level);
 	preempt_enable();
-- 
2.45.0.rc0.197.gbae5840b3b-goog



* Re: [PATCH v2] perf/core: Save raw sample data conditionally based on sample type
@ 2024-05-09 19:45 ` Namhyung Kim
From: Namhyung Kim @ 2024-05-09 19:45 UTC
  To: Yabin Cui
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Ian Rogers,
	Adrian Hunter, linux-perf-users, linux-kernel, bpf

Hello,

On Wed, May 1, 2024 at 5:06 PM Yabin Cui <yabinc@google.com> wrote:
>
> Currently, space for raw sample data is always allocated within sample
> records for both BPF output and tracepoint events. This leads to unused
> space in sample records when raw sample data is not requested.

Oh, I thought it was ok because even if it sets the _RAW bit in
data->sample_flags unconditionally, perf_prepare_sample() and
perf_output_sample() check the original event's attr->sample_type,
so the raw data won't be recorded.

But I've realized that it increases data->dyn_size already. :(
That means the sample would have garbage at the end.

I need to check if there are other places that made the same
mistake for other sample types.
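
For reference, a simplified sketch of the helper as it stands
(modeled on include/linux/perf_event.h; the fragment walk and padding
are elided) showing why the size grows regardless of the event's
sample_type:

	static inline void
	perf_sample_save_raw_data(struct perf_sample_data *data,
				  struct perf_raw_record *raw)
	{
		int size;

		/* ... walk raw->frag to compute the payload size,
		 * round it up and account for the u32 size header ... */

		data->raw = raw;
		/* Runs whenever the helper is called, so dyn_size
		 * grows even if the event never set PERF_SAMPLE_RAW: */
		data->dyn_size += size;
		data->sample_flags |= PERF_SAMPLE_RAW;
	}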

>
> This patch checks the sample type of an event before saving raw sample
> data in both the BPF output and tracepoint event handling logic. Raw
> sample data is saved only when explicitly requested, reducing overhead
> when it is not needed.
>
> Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper")
> Signed-off-by: Yabin Cui <yabinc@google.com>

Acked-by: Namhyung Kim <namhyung@kernel.org>

Thanks,
Namhyung

[SNIP]


* Re: [PATCH v2] perf/core: Save raw sample data conditionally based on sample type
@ 2024-05-09 19:55   ` Namhyung Kim
From: Namhyung Kim @ 2024-05-09 19:55 UTC
  To: Yabin Cui
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Ian Rogers,
	Adrian Hunter, linux-perf-users, linux-kernel, bpf

Another thought.

On Thu, May 9, 2024 at 12:45 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello,
>
> On Wed, May 1, 2024 at 5:06 PM Yabin Cui <yabinc@google.com> wrote:
> >
> > Currently, space for raw sample data is always allocated within sample
> > records for both BPF output and tracepoint events. This leads to unused
> > space in sample records when raw sample data is not requested.
>
> Oh, I thought it was ok because even if it sets the _RAW bit in
> data->sample_flags unconditionally, perf_prepare_sample() and
> perf_output_sample() check the original event's attr->sample_type,
> so the raw data won't be recorded.
>
> But I've realized that it increases data->dyn_size already. :(
> That means the sample would have garbage at the end.
>
> I need to check if there are other places that made the same
> mistake for other sample types.
>
> >
> > > This patch checks the sample type of an event before saving raw
> > > sample data in both the BPF output and tracepoint event handling
> > > logic. Raw sample data is saved only when explicitly requested,
> > > reducing overhead when it is not needed.
> >
> > Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper")
> > Signed-off-by: Yabin Cui <yabinc@google.com>
>
> Acked-by: Namhyung Kim <namhyung@kernel.org>
>
> Thanks,
> Namhyung
>
> > ---
[SNIP]
> > @@ -10180,13 +10181,18 @@ static void __perf_tp_event_target_task(u64 count, void *record,
> >         /* Cannot deliver synchronous signal to other task. */
> >         if (event->attr.sigtrap)
> >                 return;
> > -       if (perf_tp_event_match(event, data, regs))
> > +       if (perf_tp_event_match(event, raw, regs)) {
> > +               perf_sample_data_init(data, 0, 0);
> > +               if (event->attr.sample_type & PERF_SAMPLE_RAW)
> > +                       perf_sample_save_raw_data(data, raw);

Hmm.. to prevent future mistakes, maybe we can move the check
into perf_sample_save_raw_data() itself.

And it'd be great if you could do the same for callchain
and brstack too. :)
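
One way that refactor might look -- a hypothetical signature, with
the event passed in so the helper can check attr.sample_type itself:

	static inline void
	perf_sample_save_raw_data(struct perf_sample_data *data,
				  struct perf_event *event,
				  struct perf_raw_record *raw)
	{
		if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
			return;

		/* ... existing size accounting as before ... */
	}

Callers like perf_tp_event() and __bpf_perf_event_output() could then
call it unconditionally and drop their open-coded checks.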

Thanks,
Namhyung


[SNIP]


* Re: [PATCH v2] perf/core: Save raw sample data conditionally based on sample type
@ 2024-05-09 21:04     ` Yabin Cui
From: Yabin Cui @ 2024-05-09 21:04 UTC
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Ian Rogers,
	Adrian Hunter, linux-perf-users, linux-kernel, bpf

Thanks for reviewing the patch! I will send a v3 patch based on the comments.


On Thu, May 9, 2024 at 12:55 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Another thought.
>
[SNIP]
> Hmm.. to prevent future mistakes, maybe we can move the check
> into perf_sample_save_raw_data() itself.
>
> And it'd be great if you could do the same for callchain
> and brstack too. :)
[SNIP]

