[PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()
@ 2024-01-09 21:36 Namhyung Kim
  2024-01-09 21:36 ` [PATCH RESEND 2/2] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
  2024-01-10 14:49 ` [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Mark Rutland
  0 siblings, 2 replies; 6+ messages in thread
From: Namhyung Kim @ 2024-01-09 21:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Mingwei Zhang, Ian Rogers, Kan Liang

It was unnecessarily disabling and enabling PMUs for each event.  It
should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
at each PMU.  As pmu context has separate active lists for pinned group
and flexible group, factor out a new function to do the job.

Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
even if it needs to unthrottle sampling events.

Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..b2ff60fa487e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -883,6 +883,7 @@ struct perf_event_pmu_context {
 
 	unsigned int			nr_events;
 	unsigned int			nr_cgroups;
+	unsigned int			nr_freq;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59b332cce9e7..ce9db9dbfd4c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
 
 	if (!is_software_event(event))
 		cpc->active_oncpu--;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq--;
+		epc->nr_freq--;
+	}
 	if (event->attr.exclusive || !cpc->active_oncpu)
 		cpc->exclusive = 0;
 
@@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
 
 	if (!is_software_event(event))
 		cpc->active_oncpu++;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq++;
-
+		epc->nr_freq++;
+	}
 	if (event->attr.exclusive)
 		cpc->exclusive = 1;
 
@@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
 	}
 }
 
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	u64 now, period = TICK_NSEC;
 	s64 delta;
 
-	/*
-	 * only need to iterate over all events iff:
-	 * - context have events in frequency mode (needs freq adjust)
-	 * - there are events to unthrottle on this cpu
-	 */
-	if (!(ctx->nr_freq || unthrottle))
-		return;
-
-	raw_spin_lock(&ctx->lock);
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	list_for_each_entry(event, event_list, active_list) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
@@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 		if (!event_filter_match(event))
 			continue;
 
-		perf_pmu_disable(event->pmu);
-
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
-			goto next;
+			continue;
 
 		/*
 		 * stop the event and update event->count
@@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
-	next:
-		perf_pmu_enable(event->pmu);
+	}
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	/*
+	 * only need to iterate over all events iff:
+	 * - context have events in frequency mode (needs freq adjust)
+	 * - there are events to unthrottle on this cpu
+	 */
+	if (!(ctx->nr_freq || unthrottle))
+		return;
+
+	raw_spin_lock(&ctx->lock);
+
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (!(pmu_ctx->nr_freq || unthrottle))
+			continue;
+		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+			continue;
+
+		perf_pmu_disable(pmu_ctx->pmu);
+		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+		perf_pmu_enable(pmu_ctx->pmu);
 	}
 
 	raw_spin_unlock(&ctx->lock);
-- 
2.43.0.472.g3155946c3a-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH RESEND 2/2] perf/core: Reduce PMU access to adjust sample freq
  2024-01-09 21:36 [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
@ 2024-01-09 21:36 ` Namhyung Kim
  2024-01-10 14:49 ` [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Mark Rutland
  1 sibling, 0 replies; 6+ messages in thread
From: Namhyung Kim @ 2024-01-09 21:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Mingwei Zhang, Ian Rogers, Kan Liang

For throttled events, it first starts the event and then stop
unnecessarily.  As it's already stopped, it can directly adjust
the frequency and then move on.

Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ce9db9dbfd4c..c80b6aa0e354 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4121,10 +4121,15 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		if (hwc->interrupts == MAX_INTERRUPTS) {
 			hwc->interrupts = 0;
 			perf_log_throttle(event, 1);
-			event->pmu->start(event, 0);
-		}
 
-		if (!event->attr.freq || !event->attr.sample_freq)
+			if (!event->attr.freq || !event->attr.sample_freq) {
+				delta = 0;
+				goto next;
+			}
+
+			if (event->hw.state & PERF_HES_STOPPED)
+				goto adjust;
+		} else if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
 		/*
@@ -4132,6 +4137,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		 */
 		event->pmu->stop(event, PERF_EF_UPDATE);
 
+adjust:
 		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
@@ -4146,6 +4152,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		if (delta > 0)
 			perf_adjust_period(event, period, delta, false);
 
+next:
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
 	}
 }
-- 
2.43.0.472.g3155946c3a-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()
  2024-01-09 21:36 [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
  2024-01-09 21:36 ` [PATCH RESEND 2/2] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
@ 2024-01-10 14:49 ` Mark Rutland
  2024-01-10 18:27   ` Namhyung Kim
  1 sibling, 1 reply; 6+ messages in thread
From: Mark Rutland @ 2024-01-10 14:49 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Mingwei Zhang, Ian Rogers,
	Kan Liang

On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> It was unnecessarily disabling and enabling PMUs for each event.  It
> should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> at each PMU.  As pmu context has separate active lists for pinned group
> and flexible group, factor out a new function to do the job.
> 
> Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> even if it needs to unthrottle sampling events.
> 
> Reviewed-by: Ian Rogers <irogers@google.com>
> Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
> Tested-by: Mingwei Zhang <mizhang@google.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Hi,

I've taken a quick look and I don't think this is quite right for
hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
that below).

This seems to be a bunch of optimizations; was that based on inspection alone,
or have you found a workload where this has a measureable impact?

> ---
>  include/linux/perf_event.h |  1 +
>  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
>  2 files changed, 43 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d2a15c0c6f8a..b2ff60fa487e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
>  
>  	unsigned int			nr_events;
>  	unsigned int			nr_cgroups;
> +	unsigned int			nr_freq;
>  
>  	atomic_t			refcount; /* event <-> epc */
>  	struct rcu_head			rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 59b332cce9e7..ce9db9dbfd4c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu--;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq--;
> +		epc->nr_freq--;
> +	}
>  	if (event->attr.exclusive || !cpc->active_oncpu)
>  		cpc->exclusive = 0;
>  
> @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu++;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq++;
> -
> +		epc->nr_freq++;
> +	}
>  	if (event->attr.exclusive)
>  		cpc->exclusive = 1;
>  
> @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
>  	}
>  }
>  
> -/*
> - * combine freq adjustment with unthrottling to avoid two passes over the
> - * events. At the same time, make sure, having freq events does not change
> - * the rate of unthrottling as that would introduce bias.
> - */
> -static void
> -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  {
>  	struct perf_event *event;
>  	struct hw_perf_event *hwc;
>  	u64 now, period = TICK_NSEC;
>  	s64 delta;
>  
> -	/*
> -	 * only need to iterate over all events iff:
> -	 * - context have events in frequency mode (needs freq adjust)
> -	 * - there are events to unthrottle on this cpu
> -	 */
> -	if (!(ctx->nr_freq || unthrottle))
> -		return;
> -
> -	raw_spin_lock(&ctx->lock);
> -
> -	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> +	list_for_each_entry(event, event_list, active_list) {
>  		if (event->state != PERF_EVENT_STATE_ACTIVE)
>  			continue;
>  
> @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		if (!event_filter_match(event))
>  			continue;
>  
> -		perf_pmu_disable(event->pmu);
> -
>  		hwc = &event->hw;
>  
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
> @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		}
>  
>  		if (!event->attr.freq || !event->attr.sample_freq)
> -			goto next;
> +			continue;
>  
>  		/*
>  		 * stop the event and update event->count
> @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  			perf_adjust_period(event, period, delta, false);
>  
>  		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> -	next:
> -		perf_pmu_enable(event->pmu);
> +	}
> +}
> +
> +/*
> + * combine freq adjustment with unthrottling to avoid two passes over the
> + * events. At the same time, make sure, having freq events does not change
> + * the rate of unthrottling as that would introduce bias.
> + */
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	/*
> +	 * only need to iterate over all events iff:
> +	 * - context have events in frequency mode (needs freq adjust)
> +	 * - there are events to unthrottle on this cpu
> +	 */
> +	if (!(ctx->nr_freq || unthrottle))
> +		return;
> +
> +	raw_spin_lock(&ctx->lock);
> +
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (!(pmu_ctx->nr_freq || unthrottle))
> +			continue;
> +		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> +			continue;
> +
> +		perf_pmu_disable(pmu_ctx->pmu);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> +		perf_pmu_enable(pmu_ctx->pmu);
>  	}

I don't think this is correct for big.LITTLE/hybrid systems.

Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
events for both pmu_a and pmu_b. The perf_event_context for that task will have
a perf_event_pmu_context for each PMU in its pmu_ctx_list.

Say that task is run on CPU0, and perf_event_task_tick() is called. That will
call perf_adjust_freq_unthr_context(), and it will iterate over the
pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthottle' is true,
we'll go ahead and call the following for all of the pmu contexts in the
pmu_ctx_list:

	perf_pmu_disable(pmu_ctx->pmu);
	perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
	perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
	perf_pmu_enable(pmu_ctx->pmu);

... and that means we might call that for pmu_b, even though it's not
associated with CPU0. That could be fatal depending on what those callbacks do.

The old logic avoided that possibility implicitly, since the events for pmu_b
couldn't be active, and so the check at the start of the look would skip all of
pmu_b's events:

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		continue;

We could do similar by keeping track of how many active events each
perf_event_pmu_context has, which'd allow us to do something like:

	if (pmu_ctx->nr_active == 0)
		continue;
	
How does that sound to you?

Mark.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()
  2024-01-10 14:49 ` [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Mark Rutland
@ 2024-01-10 18:27   ` Namhyung Kim
  2024-01-10 18:44     ` Mingwei Zhang
  2024-01-11 10:41     ` Mark Rutland
  0 siblings, 2 replies; 6+ messages in thread
From: Namhyung Kim @ 2024-01-10 18:27 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Mingwei Zhang, Ian Rogers,
	Kan Liang

Hi Mark,

On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > It was unnecessarily disabling and enabling PMUs for each event.  It
> > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > at each PMU.  As pmu context has separate active lists for pinned group
> > and flexible group, factor out a new function to do the job.
> >
> > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > even if it needs to unthrottle sampling events.
> >
> > Reviewed-by: Ian Rogers <irogers@google.com>
> > Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
> > Tested-by: Mingwei Zhang <mizhang@google.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
>
> Hi,
>
> I've taken a quick look and I don't think this is quite right for
> hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> that below).

Thanks for your review!

>
> This seems to be a bunch of optimizations; was that based on inspection alone,
> or have you found a workload where this has a measureable impact?

It's from a code inspection but I think Mingwei reported some excessive
MSR accesses for KVM use cases.  Anyway it'd increase the interrupt \
latency if you have slow (uncore) PMUs and lots of events on those PMUs.

>
> > ---
> >  include/linux/perf_event.h |  1 +
> >  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
> >  2 files changed, 43 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index d2a15c0c6f8a..b2ff60fa487e 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
> >
> >       unsigned int                    nr_events;
> >       unsigned int                    nr_cgroups;
> > +     unsigned int                    nr_freq;
> >
> >       atomic_t                        refcount; /* event <-> epc */
> >       struct rcu_head                 rcu_head;
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 59b332cce9e7..ce9db9dbfd4c 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> >
> >       if (!is_software_event(event))
> >               cpc->active_oncpu--;
> > -     if (event->attr.freq && event->attr.sample_freq)
> > +     if (event->attr.freq && event->attr.sample_freq) {
> >               ctx->nr_freq--;
> > +             epc->nr_freq--;
> > +     }
> >       if (event->attr.exclusive || !cpc->active_oncpu)
> >               cpc->exclusive = 0;
> >
> > @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> >
> >       if (!is_software_event(event))
> >               cpc->active_oncpu++;
> > -     if (event->attr.freq && event->attr.sample_freq)
> > +     if (event->attr.freq && event->attr.sample_freq) {
> >               ctx->nr_freq++;
> > -
> > +             epc->nr_freq++;
> > +     }
> >       if (event->attr.exclusive)
> >               cpc->exclusive = 1;
> >
> > @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> >       }
> >  }
> >
> > -/*
> > - * combine freq adjustment with unthrottling to avoid two passes over the
> > - * events. At the same time, make sure, having freq events does not change
> > - * the rate of unthrottling as that would introduce bias.
> > - */
> > -static void
> > -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> >  {
> >       struct perf_event *event;
> >       struct hw_perf_event *hwc;
> >       u64 now, period = TICK_NSEC;
> >       s64 delta;
> >
> > -     /*
> > -      * only need to iterate over all events iff:
> > -      * - context have events in frequency mode (needs freq adjust)
> > -      * - there are events to unthrottle on this cpu
> > -      */
> > -     if (!(ctx->nr_freq || unthrottle))
> > -             return;
> > -
> > -     raw_spin_lock(&ctx->lock);
> > -
> > -     list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> > +     list_for_each_entry(event, event_list, active_list) {
> >               if (event->state != PERF_EVENT_STATE_ACTIVE)
> >                       continue;
> >
> > @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >               if (!event_filter_match(event))
> >                       continue;
> >
> > -             perf_pmu_disable(event->pmu);
> > -
> >               hwc = &event->hw;
> >
> >               if (hwc->interrupts == MAX_INTERRUPTS) {
> > @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >               }
> >
> >               if (!event->attr.freq || !event->attr.sample_freq)
> > -                     goto next;
> > +                     continue;
> >
> >               /*
> >                * stop the event and update event->count
> > @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >                       perf_adjust_period(event, period, delta, false);
> >
> >               event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> > -     next:
> > -             perf_pmu_enable(event->pmu);
> > +     }
> > +}
> > +
> > +/*
> > + * combine freq adjustment with unthrottling to avoid two passes over the
> > + * events. At the same time, make sure, having freq events does not change
> > + * the rate of unthrottling as that would introduce bias.
> > + */
> > +static void
> > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +{
> > +     struct perf_event_pmu_context *pmu_ctx;
> > +
> > +     /*
> > +      * only need to iterate over all events iff:
> > +      * - context have events in frequency mode (needs freq adjust)
> > +      * - there are events to unthrottle on this cpu
> > +      */
> > +     if (!(ctx->nr_freq || unthrottle))
> > +             return;
> > +
> > +     raw_spin_lock(&ctx->lock);
> > +
> > +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > +             if (!(pmu_ctx->nr_freq || unthrottle))
> > +                     continue;
> > +             if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > +                     continue;
> > +
> > +             perf_pmu_disable(pmu_ctx->pmu);
> > +             perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > +             perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > +             perf_pmu_enable(pmu_ctx->pmu);
> >       }
>
> I don't think this is correct for big.LITTLE/hybrid systems.
>
> Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> events for both pmu_a and pmu_b. The perf_event_context for that task will have
> a perf_event_pmu_context for each PMU in its pmu_ctx_list.
>
> Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> call perf_adjust_freq_unthr_context(), and it will iterate over the
> pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthottle' is true,
> we'll go ahead and call the following for all of the pmu contexts in the
> pmu_ctx_list:
>
>         perf_pmu_disable(pmu_ctx->pmu);
>         perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
>         perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
>         perf_pmu_enable(pmu_ctx->pmu);
>
> ... and that means we might call that for pmu_b, even though it's not
> associated with CPU0. That could be fatal depending on what those callbacks do.

Thanks for pointing that out.  I missed the hybrid cases.

>
> The old logic avoided that possibility implicitly, since the events for pmu_b
> couldn't be active, and so the check at the start of the look would skip all of
> pmu_b's events:
>
>         if (event->state != PERF_EVENT_STATE_ACTIVE)
>                 continue;
>
> We could do similar by keeping track of how many active events each
> perf_event_pmu_context has, which'd allow us to do something like:
>
>         if (pmu_ctx->nr_active == 0)
>                 continue;
>
> How does that sound to you?

Sounds good.  Maybe we can just test if both active lists are empty.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()
  2024-01-10 18:27   ` Namhyung Kim
@ 2024-01-10 18:44     ` Mingwei Zhang
  2024-01-11 10:41     ` Mark Rutland
  1 sibling, 0 replies; 6+ messages in thread
From: Mingwei Zhang @ 2024-01-10 18:44 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Mark Rutland, Peter Zijlstra, Ingo Molnar, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Ian Rogers, Kan Liang

On Wed, Jan 10, 2024 at 10:27 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi Mark,
>
> On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > > It was unnecessarily disabling and enabling PMUs for each event.  It
> > > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > > at each PMU.  As pmu context has separate active lists for pinned group
> > > and flexible group, factor out a new function to do the job.
> > >
> > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > even if it needs to unthrottle sampling events.
> > >
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
> > > Tested-by: Mingwei Zhang <mizhang@google.com>
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> >
> > Hi,
> >
> > I've taken a quick look and I don't think this is quite right for
> > hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> > that below).
>
> Thanks for your review!
>
> >
> > This seems to be a bunch of optimizations; was that based on inspection alone,
> > or have you found a workload where this has a measureable impact?
>
> It's from a code inspection but I think Mingwei reported some excessive
> MSR accesses for KVM use cases.  Anyway it'd increase the interrupt \
> latency if you have slow (uncore) PMUs and lots of events on those PMUs.
>

Yes, we have observed a huge host-level overhead when guest PMU is
multiplexing events in frequency mode. The investigation finally
narrows down to this code after profiling. We find that there are
excessive MSR writes to 0x38f, which is caused by PMU disable and
enable operations. These operations are pretty heavy weight in a
virtualized environment because all of the wrmsr to 0x38f will not
directly go to HW but get redirected via perf API to the host PMU. The
round trip overhead is very high. Note that this overhead is
implicitly visible to guests, ie., when a guest is running a
CPU-saturating workload (eg., SPEC2017), they will see their vCPU
performance return to ancient times.

The interesting part that I have observed after applying this patch is
the increase of guest PMIs. This (or there might be another reason)
causes the host-level overhead to remain high.

But the patch is still quite useful as it does reduce the excessive
VMEXITs caused by excessive wrmsr to 0x38f.

Thanks.
-Mingwei
> >
> > > ---
> > >  include/linux/perf_event.h |  1 +
> > >  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
> > >  2 files changed, 43 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > > index d2a15c0c6f8a..b2ff60fa487e 100644
> > > --- a/include/linux/perf_event.h
> > > +++ b/include/linux/perf_event.h
> > > @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
> > >
> > >       unsigned int                    nr_events;
> > >       unsigned int                    nr_cgroups;
> > > +     unsigned int                    nr_freq;
> > >
> > >       atomic_t                        refcount; /* event <-> epc */
> > >       struct rcu_head                 rcu_head;
> > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > index 59b332cce9e7..ce9db9dbfd4c 100644
> > > --- a/kernel/events/core.c
> > > +++ b/kernel/events/core.c
> > > @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> > >
> > >       if (!is_software_event(event))
> > >               cpc->active_oncpu--;
> > > -     if (event->attr.freq && event->attr.sample_freq)
> > > +     if (event->attr.freq && event->attr.sample_freq) {
> > >               ctx->nr_freq--;
> > > +             epc->nr_freq--;
> > > +     }
> > >       if (event->attr.exclusive || !cpc->active_oncpu)
> > >               cpc->exclusive = 0;
> > >
> > > @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> > >
> > >       if (!is_software_event(event))
> > >               cpc->active_oncpu++;
> > > -     if (event->attr.freq && event->attr.sample_freq)
> > > +     if (event->attr.freq && event->attr.sample_freq) {
> > >               ctx->nr_freq++;
> > > -
> > > +             epc->nr_freq++;
> > > +     }
> > >       if (event->attr.exclusive)
> > >               cpc->exclusive = 1;
> > >
> > > @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> > >       }
> > >  }
> > >
> > > -/*
> > > - * combine freq adjustment with unthrottling to avoid two passes over the
> > > - * events. At the same time, make sure, having freq events does not change
> > > - * the rate of unthrottling as that would introduce bias.
> > > - */
> > > -static void
> > > -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> > >  {
> > >       struct perf_event *event;
> > >       struct hw_perf_event *hwc;
> > >       u64 now, period = TICK_NSEC;
> > >       s64 delta;
> > >
> > > -     /*
> > > -      * only need to iterate over all events iff:
> > > -      * - context have events in frequency mode (needs freq adjust)
> > > -      * - there are events to unthrottle on this cpu
> > > -      */
> > > -     if (!(ctx->nr_freq || unthrottle))
> > > -             return;
> > > -
> > > -     raw_spin_lock(&ctx->lock);
> > > -
> > > -     list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> > > +     list_for_each_entry(event, event_list, active_list) {
> > >               if (event->state != PERF_EVENT_STATE_ACTIVE)
> > >                       continue;
> > >
> > > @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > >               if (!event_filter_match(event))
> > >                       continue;
> > >
> > > -             perf_pmu_disable(event->pmu);
> > > -
> > >               hwc = &event->hw;
> > >
> > >               if (hwc->interrupts == MAX_INTERRUPTS) {
> > > @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > >               }
> > >
> > >               if (!event->attr.freq || !event->attr.sample_freq)
> > > -                     goto next;
> > > +                     continue;
> > >
> > >               /*
> > >                * stop the event and update event->count
> > > @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > >                       perf_adjust_period(event, period, delta, false);
> > >
> > >               event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> > > -     next:
> > > -             perf_pmu_enable(event->pmu);
> > > +     }
> > > +}
> > > +
> > > +/*
> > > + * combine freq adjustment with unthrottling to avoid two passes over the
> > > + * events. At the same time, make sure, having freq events does not change
> > > + * the rate of unthrottling as that would introduce bias.
> > > + */
> > > +static void
> > > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +{
> > > +     struct perf_event_pmu_context *pmu_ctx;
> > > +
> > > +     /*
> > > +      * only need to iterate over all events iff:
> > > +      * - context have events in frequency mode (needs freq adjust)
> > > +      * - there are events to unthrottle on this cpu
> > > +      */
> > > +     if (!(ctx->nr_freq || unthrottle))
> > > +             return;
> > > +
> > > +     raw_spin_lock(&ctx->lock);
> > > +
> > > +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > > +             if (!(pmu_ctx->nr_freq || unthrottle))
> > > +                     continue;
> > > +             if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > > +                     continue;
> > > +
> > > +             perf_pmu_disable(pmu_ctx->pmu);
> > > +             perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > > +             perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > > +             perf_pmu_enable(pmu_ctx->pmu);
> > >       }
> >
> > I don't think this is correct for big.LITTLE/hybrid systems.
> >
> > Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> > events for both pmu_a and pmu_b. The perf_event_context for that task will have
> > a perf_event_pmu_context for each PMU in its pmu_ctx_list.
> >
> > Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> > call perf_adjust_freq_unthr_context(), and it will iterate over the
> > pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthottle' is true,
> > we'll go ahead and call the following for all of the pmu contexts in the
> > pmu_ctx_list:
> >
> >         perf_pmu_disable(pmu_ctx->pmu);
> >         perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> >         perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> >         perf_pmu_enable(pmu_ctx->pmu);
> >
> > ... and that means we might call that for pmu_b, even though it's not
> > associated with CPU0. That could be fatal depending on what those callbacks do.
>
> Thanks for pointing that out.  I missed the hybrid cases.
>
> >
> > The old logic avoided that possibility implicitly, since the events for pmu_b
> > couldn't be active, and so the check at the start of the look would skip all of
> > pmu_b's events:
> >
> >         if (event->state != PERF_EVENT_STATE_ACTIVE)
> >                 continue;
> >
> > We could do similar by keeping track of how many active events each
> > perf_event_pmu_context has, which'd allow us to do something like:
> >
> >         if (pmu_ctx->nr_active == 0)
> >                 continue;
> >
> > How does that sound to you?
>
> Sounds good.  Maybe we can just test if both active lists are empty.
>
> Thanks,
> Namhyung

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()
  2024-01-10 18:27   ` Namhyung Kim
  2024-01-10 18:44     ` Mingwei Zhang
@ 2024-01-11 10:41     ` Mark Rutland
  1 sibling, 0 replies; 6+ messages in thread
From: Mark Rutland @ 2024-01-11 10:41 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Mingwei Zhang, Ian Rogers,
	Kan Liang

On Wed, Jan 10, 2024 at 10:27:27AM -0800, Namhyung Kim wrote:
> On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > > It was unnecessarily disabling and enabling PMUs for each event.  It
> > > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > > at each PMU.  As pmu context has separate active lists for pinned group
> > > and flexible group, factor out a new function to do the job.
> > >
> > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > even if it needs to unthrottle sampling events.
> > >
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
> > > Tested-by: Mingwei Zhang <mizhang@google.com>
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> >
> > Hi,
> >
> > I've taken a quick look and I don't think this is quite right for
> > hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> > that below).
> 
> Thanks for your review!

No problem!

> > This seems to be a bunch of optimizations; was that based on inspection alone,
> > or have you found a workload where this has a measureable impact?
> 
> It's from a code inspection but I think Mingwei reported some excessive
> MSR accesses for KVM use cases.  Anyway it'd increase the interrupt \
> latency if you have slow (uncore) PMUs and lots of events on those PMUs.

Makes sense; it would be good if we could put smoething in the commit message
mentioning that.

[...]

> > > +static void
> > > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +{
> > > +     struct perf_event_pmu_context *pmu_ctx;
> > > +
> > > +     /*
> > > +      * only need to iterate over all events iff:
> > > +      * - context have events in frequency mode (needs freq adjust)
> > > +      * - there are events to unthrottle on this cpu
> > > +      */
> > > +     if (!(ctx->nr_freq || unthrottle))
> > > +             return;
> > > +
> > > +     raw_spin_lock(&ctx->lock);
> > > +
> > > +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > > +             if (!(pmu_ctx->nr_freq || unthrottle))
> > > +                     continue;
> > > +             if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > > +                     continue;
> > > +
> > > +             perf_pmu_disable(pmu_ctx->pmu);
> > > +             perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > > +             perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > > +             perf_pmu_enable(pmu_ctx->pmu);
> > >       }
> >
> > I don't think this is correct for big.LITTLE/hybrid systems.
> >
> > Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> > events for both pmu_a and pmu_b. The perf_event_context for that task will have
> > a perf_event_pmu_context for each PMU in its pmu_ctx_list.
> >
> > Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> > call perf_adjust_freq_unthr_context(), and it will iterate over the
> > pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthottle' is true,
> > we'll go ahead and call the following for all of the pmu contexts in the
> > pmu_ctx_list:
> >
> >         perf_pmu_disable(pmu_ctx->pmu);
> >         perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> >         perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> >         perf_pmu_enable(pmu_ctx->pmu);
> >
> > ... and that means we might call that for pmu_b, even though it's not
> > associated with CPU0. That could be fatal depending on what those callbacks do.
> 
> Thanks for pointing that out.  I missed the hybrid cases.
> 
> > The old logic avoided that possibility implicitly, since the events for pmu_b
> > couldn't be active, and so the check at the start of the look would skip all of
> > pmu_b's events:
> >
> >         if (event->state != PERF_EVENT_STATE_ACTIVE)
> >                 continue;
> >
> > We could do similar by keeping track of how many active events each
> > perf_event_pmu_context has, which'd allow us to do something like:
> >
> >         if (pmu_ctx->nr_active == 0)
> >                 continue;
> >
> > How does that sound to you?
> 
> Sounds good.  Maybe we can just test if both active lists are empty.

Good idea, I think that'd be simpler and less fragile.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-01-11 10:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-09 21:36 [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
2024-01-09 21:36 ` [PATCH RESEND 2/2] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
2024-01-10 14:49 ` [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context() Mark Rutland
2024-01-10 18:27   ` Namhyung Kim
2024-01-10 18:44     ` Mingwei Zhang
2024-01-11 10:41     ` Mark Rutland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox