[PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
@ 2023-11-20 22:19 Namhyung Kim
  2023-11-20 22:19 ` [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Namhyung Kim @ 2023-11-20 22:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Kan Liang, Mingwei Zhang

perf_adjust_freq_unthr_context() was unnecessarily disabling and enabling
the PMU for each event.  That should be done once at the PMU level.  Add a
pmu_ctx->nr_freq counter to check it for each PMU.  As a PMU context has
separate active lists for pinned and flexible groups, factor out a new
function to do the job.

Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
even if it needs to unthrottle sampling events.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0367d748fae0..3eb17dc89f5e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -879,6 +879,7 @@ struct perf_event_pmu_context {
 
 	unsigned int			nr_events;
 	unsigned int			nr_cgroups;
+	unsigned int			nr_freq;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eb26c2c6e65..53e2ad73102d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2275,8 +2275,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
 
 	if (!is_software_event(event))
 		cpc->active_oncpu--;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq--;
+		epc->nr_freq--;
+	}
 	if (event->attr.exclusive || !cpc->active_oncpu)
 		cpc->exclusive = 0;
 
@@ -2531,9 +2533,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
 
 	if (!is_software_event(event))
 		cpc->active_oncpu++;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq++;
-
+		epc->nr_freq++;
+	}
 	if (event->attr.exclusive)
 		cpc->exclusive = 1;
 
@@ -4096,30 +4099,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
 	}
 }
 
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	u64 now, period = TICK_NSEC;
 	s64 delta;
 
-	/*
-	 * only need to iterate over all events iff:
-	 * - context have events in frequency mode (needs freq adjust)
-	 * - there are events to unthrottle on this cpu
-	 */
-	if (!(ctx->nr_freq || unthrottle))
-		return;
-
-	raw_spin_lock(&ctx->lock);
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	list_for_each_entry(event, event_list, active_list) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
@@ -4127,8 +4114,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 		if (!event_filter_match(event))
 			continue;
 
-		perf_pmu_disable(event->pmu);
-
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -4138,7 +4123,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
-			goto next;
+			continue;
 
 		/*
 		 * stop the event and update event->count
@@ -4160,8 +4145,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
-	next:
-		perf_pmu_enable(event->pmu);
+	}
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	/*
+	 * only need to iterate over all events iff:
+	 * - context have events in frequency mode (needs freq adjust)
+	 * - there are events to unthrottle on this cpu
+	 */
+	if (!(ctx->nr_freq || unthrottle))
+		return;
+
+	raw_spin_lock(&ctx->lock);
+
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (!(pmu_ctx->nr_freq || unthrottle))
+			continue;
+		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+			continue;
+
+		perf_pmu_disable(pmu_ctx->pmu);
+		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+		perf_pmu_enable(pmu_ctx->pmu);
 	}
 
 	raw_spin_unlock(&ctx->lock);
-- 
2.43.0.rc1.413.gea7ed67945-goog


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq
  2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
@ 2023-11-20 22:19 ` Namhyung Kim
  2023-11-21 15:57   ` Liang, Kan
  2023-11-20 22:19 ` [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs Namhyung Kim
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Namhyung Kim @ 2023-11-20 22:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Kan Liang, Mingwei Zhang

For throttled events, the code first starts the event and then stops it
again unnecessarily.  As the event is already stopped, the code can
directly adjust the frequency and then move on.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 53e2ad73102d..fd3449e4d081 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4119,10 +4119,15 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		if (hwc->interrupts == MAX_INTERRUPTS) {
 			hwc->interrupts = 0;
 			perf_log_throttle(event, 1);
-			event->pmu->start(event, 0);
-		}
 
-		if (!event->attr.freq || !event->attr.sample_freq)
+			if (!event->attr.freq || !event->attr.sample_freq) {
+				delta = 0;
+				goto next;
+			}
+
+			if (event->hw.state & PERF_HES_STOPPED)
+				goto adjust;
+		} else if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
 		/*
@@ -4130,6 +4135,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		 */
 		event->pmu->stop(event, PERF_EF_UPDATE);
 
+adjust:
 		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
@@ -4144,6 +4150,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		if (delta > 0)
 			perf_adjust_period(event, period, delta, false);
 
+next:
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
 	}
 }
-- 
2.43.0.rc1.413.gea7ed67945-goog


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs
  2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
  2023-11-20 22:19 ` [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
@ 2023-11-20 22:19 ` Namhyung Kim
  2023-11-21 15:59   ` Liang, Kan
  2023-11-20 22:41 ` [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Ian Rogers
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Namhyung Kim @ 2023-11-20 22:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Kan Liang, Mingwei Zhang

Sampling is not supported for uncore PMU events.  While it's
technically possible for them to generate interrupts, let's treat the
PMU as if it has no interrupts in order to skip the freq
adjust/unthrottling logic in the timer handler, which is only meaningful
for sampling events.

Also remove the sampling event check because it'd be done in the general
code in the perf_event_open syscall.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 arch/x86/events/intel/uncore.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 69043e02e8a7..f7e6228bd1b1 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -744,10 +744,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
 	if (pmu->func_id < 0)
 		return -ENOENT;
 
-	/* Sampling not supported yet */
-	if (hwc->sample_period)
-		return -EINVAL;
-
 	/*
 	 * Place all uncore events for a particular physical package
 	 * onto a single cpu
@@ -919,7 +915,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
 			.stop		= uncore_pmu_event_stop,
 			.read		= uncore_pmu_event_read,
 			.module		= THIS_MODULE,
-			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+			/*
+			 * It doesn't allow sampling for uncore events, let's
+			 * treat the PMU has no interrupts to skip them in the
+			 * perf_adjust_freq_unthr_context().
+			 */
+			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
 			.attr_update	= pmu->type->attr_update,
 		};
 	} else {
-- 
2.43.0.rc1.413.gea7ed67945-goog


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
  2023-11-20 22:19 ` [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
  2023-11-20 22:19 ` [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs Namhyung Kim
@ 2023-11-20 22:41 ` Ian Rogers
  2023-11-20 23:23   ` Mingwei Zhang
  2023-11-21 15:57 ` Liang, Kan
  2023-12-02  6:16 ` Mingwei Zhang
  4 siblings, 1 reply; 15+ messages in thread
From: Ian Rogers @ 2023-11-20 22:41 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Kan Liang, Mingwei Zhang

On Mon, Nov 20, 2023 at 2:19 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> It was unnecessarily disabling and enabling PMUs for each event.  It
> should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> at each PMU.  As pmu context has separate active lists for pinned group
> and flexible group, factor out a new function to do the job.
>
> Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> even if it needs to unthrottle sampling events.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Series:
Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
>  include/linux/perf_event.h |  1 +
>  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
>  2 files changed, 43 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0367d748fae0..3eb17dc89f5e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -879,6 +879,7 @@ struct perf_event_pmu_context {
>
>         unsigned int                    nr_events;
>         unsigned int                    nr_cgroups;
> +       unsigned int                    nr_freq;
>
>         atomic_t                        refcount; /* event <-> epc */
>         struct rcu_head                 rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 3eb26c2c6e65..53e2ad73102d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2275,8 +2275,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
>
>         if (!is_software_event(event))
>                 cpc->active_oncpu--;
> -       if (event->attr.freq && event->attr.sample_freq)
> +       if (event->attr.freq && event->attr.sample_freq) {
>                 ctx->nr_freq--;
> +               epc->nr_freq--;
> +       }
>         if (event->attr.exclusive || !cpc->active_oncpu)
>                 cpc->exclusive = 0;
>
> @@ -2531,9 +2533,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>
>         if (!is_software_event(event))
>                 cpc->active_oncpu++;
> -       if (event->attr.freq && event->attr.sample_freq)
> +       if (event->attr.freq && event->attr.sample_freq) {
>                 ctx->nr_freq++;
> -
> +               epc->nr_freq++;
> +       }
>         if (event->attr.exclusive)
>                 cpc->exclusive = 1;
>
> @@ -4096,30 +4099,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
>         }
>  }
>
> -/*
> - * combine freq adjustment with unthrottling to avoid two passes over the
> - * events. At the same time, make sure, having freq events does not change
> - * the rate of unthrottling as that would introduce bias.
> - */
> -static void
> -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  {
>         struct perf_event *event;
>         struct hw_perf_event *hwc;
>         u64 now, period = TICK_NSEC;
>         s64 delta;
>
> -       /*
> -        * only need to iterate over all events iff:
> -        * - context have events in frequency mode (needs freq adjust)
> -        * - there are events to unthrottle on this cpu
> -        */
> -       if (!(ctx->nr_freq || unthrottle))
> -               return;
> -
> -       raw_spin_lock(&ctx->lock);
> -
> -       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> +       list_for_each_entry(event, event_list, active_list) {
>                 if (event->state != PERF_EVENT_STATE_ACTIVE)
>                         continue;
>
> @@ -4127,8 +4114,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>                 if (!event_filter_match(event))
>                         continue;
>
> -               perf_pmu_disable(event->pmu);
> -
>                 hwc = &event->hw;
>
>                 if (hwc->interrupts == MAX_INTERRUPTS) {
> @@ -4138,7 +4123,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>                 }
>
>                 if (!event->attr.freq || !event->attr.sample_freq)
> -                       goto next;
> +                       continue;
>
>                 /*
>                  * stop the event and update event->count
> @@ -4160,8 +4145,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>                         perf_adjust_period(event, period, delta, false);
>
>                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> -       next:
> -               perf_pmu_enable(event->pmu);
> +       }
> +}
> +
> +/*
> + * combine freq adjustment with unthrottling to avoid two passes over the
> + * events. At the same time, make sure, having freq events does not change
> + * the rate of unthrottling as that would introduce bias.
> + */
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +{
> +       struct perf_event_pmu_context *pmu_ctx;
> +
> +       /*
> +        * only need to iterate over all events iff:
> +        * - context have events in frequency mode (needs freq adjust)
> +        * - there are events to unthrottle on this cpu
> +        */
> +       if (!(ctx->nr_freq || unthrottle))
> +               return;
> +
> +       raw_spin_lock(&ctx->lock);
> +
> +       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +               if (!(pmu_ctx->nr_freq || unthrottle))
> +                       continue;
> +               if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> +                       continue;
> +
> +               perf_pmu_disable(pmu_ctx->pmu);
> +               perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> +               perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> +               perf_pmu_enable(pmu_ctx->pmu);
>         }
>
>         raw_spin_unlock(&ctx->lock);
> --
> 2.43.0.rc1.413.gea7ed67945-goog
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-20 22:41 ` [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Ian Rogers
@ 2023-11-20 23:23   ` Mingwei Zhang
  2023-11-21 18:21     ` Namhyung Kim
  0 siblings, 1 reply; 15+ messages in thread
From: Mingwei Zhang @ 2023-11-20 23:23 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Namhyung Kim, Peter Zijlstra, Ingo Molnar, Mark Rutland,
	Alexander Shishkin, Arnaldo Carvalho de Melo, LKML, Kan Liang

On Mon, Nov 20, 2023, Ian Rogers wrote:
> On Mon, Nov 20, 2023 at 2:19 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > It was unnecessarily disabling and enabling PMUs for each event.  It
> > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > at each PMU.  As pmu context has separate active lists for pinned group
> > and flexible group, factor out a new function to do the job.
> >
> > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > even if it needs to unthrottle sampling events.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> 
> Series:
> Reviewed-by: Ian Rogers <irogers@google.com>
> 
> Thanks,
> Ian
> 

Can we have "Cc: stable@vger.kernel.org" for the whole series? This
series should bring a great performance improvement for all VMs in which
perf sampling events are used without specifying a period.

The key point is that disabling/enabling the PMU in a virtualized
environment is super heavyweight and can take up to 50% of the CPU
time, i.e., when multiplexing is used in the VM, a vCPU on a pCPU can only
use 50% of the resource; the other half is entirely wasted in host PMU
code enabling/disabling the PMU.

Thanks.
-Mingwei

> > ---
> >  include/linux/perf_event.h |  1 +
> >  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
> >  2 files changed, 43 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index 0367d748fae0..3eb17dc89f5e 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -879,6 +879,7 @@ struct perf_event_pmu_context {
> >
> >         unsigned int                    nr_events;
> >         unsigned int                    nr_cgroups;
> > +       unsigned int                    nr_freq;
> >
> >         atomic_t                        refcount; /* event <-> epc */
> >         struct rcu_head                 rcu_head;
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 3eb26c2c6e65..53e2ad73102d 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2275,8 +2275,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> >
> >         if (!is_software_event(event))
> >                 cpc->active_oncpu--;
> > -       if (event->attr.freq && event->attr.sample_freq)
> > +       if (event->attr.freq && event->attr.sample_freq) {
> >                 ctx->nr_freq--;
> > +               epc->nr_freq--;
> > +       }
> >         if (event->attr.exclusive || !cpc->active_oncpu)
> >                 cpc->exclusive = 0;
> >
> > @@ -2531,9 +2533,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> >
> >         if (!is_software_event(event))
> >                 cpc->active_oncpu++;
> > -       if (event->attr.freq && event->attr.sample_freq)
> > +       if (event->attr.freq && event->attr.sample_freq) {
> >                 ctx->nr_freq++;
> > -
> > +               epc->nr_freq++;
> > +       }
> >         if (event->attr.exclusive)
> >                 cpc->exclusive = 1;
> >
> > @@ -4096,30 +4099,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> >         }
> >  }
> >
> > -/*
> > - * combine freq adjustment with unthrottling to avoid two passes over the
> > - * events. At the same time, make sure, having freq events does not change
> > - * the rate of unthrottling as that would introduce bias.
> > - */
> > -static void
> > -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> >  {
> >         struct perf_event *event;
> >         struct hw_perf_event *hwc;
> >         u64 now, period = TICK_NSEC;
> >         s64 delta;
> >
> > -       /*
> > -        * only need to iterate over all events iff:
> > -        * - context have events in frequency mode (needs freq adjust)
> > -        * - there are events to unthrottle on this cpu
> > -        */
> > -       if (!(ctx->nr_freq || unthrottle))
> > -               return;
> > -
> > -       raw_spin_lock(&ctx->lock);
> > -
> > -       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> > +       list_for_each_entry(event, event_list, active_list) {
> >                 if (event->state != PERF_EVENT_STATE_ACTIVE)
> >                         continue;
> >
> > @@ -4127,8 +4114,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >                 if (!event_filter_match(event))
> >                         continue;
> >
> > -               perf_pmu_disable(event->pmu);
> > -
> >                 hwc = &event->hw;
> >
> >                 if (hwc->interrupts == MAX_INTERRUPTS) {
> > @@ -4138,7 +4123,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >                 }
> >
> >                 if (!event->attr.freq || !event->attr.sample_freq)
> > -                       goto next;
> > +                       continue;
> >
> >                 /*
> >                  * stop the event and update event->count
> > @@ -4160,8 +4145,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> >                         perf_adjust_period(event, period, delta, false);
> >
> >                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> > -       next:
> > -               perf_pmu_enable(event->pmu);
> > +       }
> > +}
> > +
> > +/*
> > + * combine freq adjustment with unthrottling to avoid two passes over the
> > + * events. At the same time, make sure, having freq events does not change
> > + * the rate of unthrottling as that would introduce bias.
> > + */
> > +static void
> > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +{
> > +       struct perf_event_pmu_context *pmu_ctx;
> > +
> > +       /*
> > +        * only need to iterate over all events iff:
> > +        * - context have events in frequency mode (needs freq adjust)
> > +        * - there are events to unthrottle on this cpu
> > +        */
> > +       if (!(ctx->nr_freq || unthrottle))
> > +               return;
> > +
> > +       raw_spin_lock(&ctx->lock);
> > +
> > +       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > +               if (!(pmu_ctx->nr_freq || unthrottle))
> > +                       continue;
> > +               if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > +                       continue;
> > +
> > +               perf_pmu_disable(pmu_ctx->pmu);
> > +               perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > +               perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > +               perf_pmu_enable(pmu_ctx->pmu);
> >         }
> >
> >         raw_spin_unlock(&ctx->lock);
> > --
> > 2.43.0.rc1.413.gea7ed67945-goog
> >

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
                   ` (2 preceding siblings ...)
  2023-11-20 22:41 ` [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Ian Rogers
@ 2023-11-21 15:57 ` Liang, Kan
  2023-12-02  6:16 ` Mingwei Zhang
  4 siblings, 0 replies; 15+ messages in thread
From: Liang, Kan @ 2023-11-21 15:57 UTC (permalink / raw)
  To: Namhyung Kim, Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Mingwei Zhang



On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
> It was unnecessarily disabling and enabling PMUs for each event.  It
> should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> at each PMU.  As pmu context has separate active lists for pinned group
> and flexible group, factor out a new function to do the job.
> 
> Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> even if it needs to unthrottle sampling events.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Kan Liang <kan.liang@linux.intel.com>

Thanks,
Kan

> ---
>  include/linux/perf_event.h |  1 +
>  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
>  2 files changed, 43 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0367d748fae0..3eb17dc89f5e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -879,6 +879,7 @@ struct perf_event_pmu_context {
>  
>  	unsigned int			nr_events;
>  	unsigned int			nr_cgroups;
> +	unsigned int			nr_freq;
>  
>  	atomic_t			refcount; /* event <-> epc */
>  	struct rcu_head			rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 3eb26c2c6e65..53e2ad73102d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2275,8 +2275,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu--;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq--;
> +		epc->nr_freq--;
> +	}
>  	if (event->attr.exclusive || !cpc->active_oncpu)
>  		cpc->exclusive = 0;
>  
> @@ -2531,9 +2533,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu++;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq++;
> -
> +		epc->nr_freq++;
> +	}
>  	if (event->attr.exclusive)
>  		cpc->exclusive = 1;
>  
> @@ -4096,30 +4099,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
>  	}
>  }
>  
> -/*
> - * combine freq adjustment with unthrottling to avoid two passes over the
> - * events. At the same time, make sure, having freq events does not change
> - * the rate of unthrottling as that would introduce bias.
> - */
> -static void
> -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  {
>  	struct perf_event *event;
>  	struct hw_perf_event *hwc;
>  	u64 now, period = TICK_NSEC;
>  	s64 delta;
>  
> -	/*
> -	 * only need to iterate over all events iff:
> -	 * - context have events in frequency mode (needs freq adjust)
> -	 * - there are events to unthrottle on this cpu
> -	 */
> -	if (!(ctx->nr_freq || unthrottle))
> -		return;
> -
> -	raw_spin_lock(&ctx->lock);
> -
> -	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> +	list_for_each_entry(event, event_list, active_list) {
>  		if (event->state != PERF_EVENT_STATE_ACTIVE)
>  			continue;
>  
> @@ -4127,8 +4114,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		if (!event_filter_match(event))
>  			continue;
>  
> -		perf_pmu_disable(event->pmu);
> -
>  		hwc = &event->hw;
>  
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
> @@ -4138,7 +4123,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		}
>  
>  		if (!event->attr.freq || !event->attr.sample_freq)
> -			goto next;
> +			continue;
>  
>  		/*
>  		 * stop the event and update event->count
> @@ -4160,8 +4145,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  			perf_adjust_period(event, period, delta, false);
>  
>  		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> -	next:
> -		perf_pmu_enable(event->pmu);
> +	}
> +}
> +
> +/*
> + * combine freq adjustment with unthrottling to avoid two passes over the
> + * events. At the same time, make sure, having freq events does not change
> + * the rate of unthrottling as that would introduce bias.
> + */
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	/*
> +	 * only need to iterate over all events iff:
> +	 * - context have events in frequency mode (needs freq adjust)
> +	 * - there are events to unthrottle on this cpu
> +	 */
> +	if (!(ctx->nr_freq || unthrottle))
> +		return;
> +
> +	raw_spin_lock(&ctx->lock);
> +
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (!(pmu_ctx->nr_freq || unthrottle))
> +			continue;
> +		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> +			continue;
> +
> +		perf_pmu_disable(pmu_ctx->pmu);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> +		perf_pmu_enable(pmu_ctx->pmu);
>  	}
>  
>  	raw_spin_unlock(&ctx->lock);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq
  2023-11-20 22:19 ` [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
@ 2023-11-21 15:57   ` Liang, Kan
  0 siblings, 0 replies; 15+ messages in thread
From: Liang, Kan @ 2023-11-21 15:57 UTC (permalink / raw)
  To: Namhyung Kim, Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Mingwei Zhang



On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
> For throttled events, it first starts the event and then stop
> unnecessarily.  As it's already stopped, it can directly adjust
> the frequency and then move on.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Kan Liang <kan.liang@linux.intel.com>

Thanks,
Kan

> ---
>  kernel/events/core.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 53e2ad73102d..fd3449e4d081 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4119,10 +4119,15 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
>  			hwc->interrupts = 0;
>  			perf_log_throttle(event, 1);
> -			event->pmu->start(event, 0);
> -		}
>  
> -		if (!event->attr.freq || !event->attr.sample_freq)
> +			if (!event->attr.freq || !event->attr.sample_freq) {
> +				delta = 0;
> +				goto next;
> +			}
> +
> +			if (event->hw.state & PERF_HES_STOPPED)
> +				goto adjust;
> +		} else if (!event->attr.freq || !event->attr.sample_freq)
>  			continue;
>  
>  		/*
> @@ -4130,6 +4135,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  		 */
>  		event->pmu->stop(event, PERF_EF_UPDATE);
>  
> +adjust:
>  		now = local64_read(&event->count);
>  		delta = now - hwc->freq_count_stamp;
>  		hwc->freq_count_stamp = now;
> @@ -4144,6 +4150,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  		if (delta > 0)
>  			perf_adjust_period(event, period, delta, false);
>  
> +next:
>  		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
>  	}
>  }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs
  2023-11-20 22:19 ` [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs Namhyung Kim
@ 2023-11-21 15:59   ` Liang, Kan
  2023-11-21 18:30     ` Namhyung Kim
  0 siblings, 1 reply; 15+ messages in thread
From: Liang, Kan @ 2023-11-21 15:59 UTC (permalink / raw)
  To: Namhyung Kim, Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Mingwei Zhang



On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
> It doesn't support sampling in uncore PMU events.  While it's
> technically possible to generate interrupts, let's treat it as if it
> has no interrupt in order to skip the freq adjust/unthrottling logic
> in the timer handler which is only meaningful to sampling events.
> 
> Also remove the sampling event check because it'd be done in the general
> code in the perf_event_open syscall.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  arch/x86/events/intel/uncore.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
> index 69043e02e8a7..f7e6228bd1b1 100644
> --- a/arch/x86/events/intel/uncore.c
> +++ b/arch/x86/events/intel/uncore.c
> @@ -744,10 +744,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
>  	if (pmu->func_id < 0)
>  		return -ENOENT;
>  
> -	/* Sampling not supported yet */
> -	if (hwc->sample_period)
> -		return -EINVAL;
> -
>  	/*
>  	 * Place all uncore events for a particular physical package
>  	 * onto a single cpu
> @@ -919,7 +915,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
>  			.stop		= uncore_pmu_event_stop,
>  			.read		= uncore_pmu_event_read,
>  			.module		= THIS_MODULE,
> -			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			/*
> +			 * It doesn't allow sampling for uncore events, let's
> +			 * treat the PMU has no interrupts to skip them in the
> +			 * perf_adjust_freq_unthr_context().
> +			 */
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
>  			.attr_update	= pmu->type->attr_update,
>  		};


There is a special customized uncore PMU which needs the flag as well.

diff --git a/arch/x86/events/intel/uncore_snb.c
b/arch/x86/events/intel/uncore_snb.c
index 7fd4334e12a1..46a63e291975 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1013,7 +1013,7 @@ static struct pmu snb_uncore_imc_pmu = {
        .start          = uncore_pmu_event_start,
        .stop           = uncore_pmu_event_stop,
        .read           = uncore_pmu_event_read,
-       .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
+       .capabilities   = PERF_PMU_CAP_NO_EXCLUDE |
PERF_PMU_CAP_NO_INTERRUPT,
 };

 static struct intel_uncore_ops snb_uncore_imc_ops = {


Thanks,
Kan
>  	} else {

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-20 23:23   ` Mingwei Zhang
@ 2023-11-21 18:21     ` Namhyung Kim
  2023-11-21 23:01       ` Mingwei Zhang
  0 siblings, 1 reply; 15+ messages in thread
From: Namhyung Kim @ 2023-11-21 18:21 UTC (permalink / raw)
  To: Mingwei Zhang
  Cc: Ian Rogers, Peter Zijlstra, Ingo Molnar, Mark Rutland,
	Alexander Shishkin, Arnaldo Carvalho de Melo, LKML, Kan Liang

Hi Mingwei,

On Mon, Nov 20, 2023 at 3:24 PM Mingwei Zhang <mizhang@google.com> wrote:
>
> On Mon, Nov 20, 2023, Ian Rogers wrote:
> > On Mon, Nov 20, 2023 at 2:19 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > It was unnecessarily disabling and enabling PMUs for each event.  It
> > > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > > at each PMU.  As pmu context has separate active lists for pinned group
> > > and flexible group, factor out a new function to do the job.
> > >
> > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > even if it needs to unthrottle sampling events.
> > >
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> >
> > Series:
> > Reviewed-by: Ian Rogers <irogers@google.com>
> >
> > Thanks,
> > Ian
> >
>
> Can we have "Cc: stable@vger.kernel.org" for the whole series? This
> series should have a great performance improvement for all VMs in which
> perf sampling events without specifying period.

I was not sure if it's ok to have this performance fix in the stable series.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs
  2023-11-21 15:59   ` Liang, Kan
@ 2023-11-21 18:30     ` Namhyung Kim
  2023-11-21 19:26       ` Liang, Kan
  0 siblings, 1 reply; 15+ messages in thread
From: Namhyung Kim @ 2023-11-21 18:30 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Ian Rogers, Mingwei Zhang

Hi Kan,

On Tue, Nov 21, 2023 at 7:59 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
> > It doesn't support sampling in uncore PMU events.  While it's
> > technically possible to generate interrupts, let's treat it as if it
> > has no interrupt in order to skip the freq adjust/unthrottling logic
> > in the timer handler which is only meaningful to sampling events.
> >
> > Also remove the sampling event check because it'd be done in the general
> > code in the perf_event_open syscall.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  arch/x86/events/intel/uncore.c | 11 ++++++-----
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
> > index 69043e02e8a7..f7e6228bd1b1 100644
> > --- a/arch/x86/events/intel/uncore.c
> > +++ b/arch/x86/events/intel/uncore.c
> > @@ -744,10 +744,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
> >       if (pmu->func_id < 0)
> >               return -ENOENT;
> >
> > -     /* Sampling not supported yet */
> > -     if (hwc->sample_period)
> > -             return -EINVAL;
> > -
> >       /*
> >        * Place all uncore events for a particular physical package
> >        * onto a single cpu
> > @@ -919,7 +915,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
> >                       .stop           = uncore_pmu_event_stop,
> >                       .read           = uncore_pmu_event_read,
> >                       .module         = THIS_MODULE,
> > -                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> > +                     /*
> > +                      * It doesn't allow sampling for uncore events, let's
> > +                      * treat the PMU has no interrupts to skip them in the
> > +                      * perf_adjust_freq_unthr_context().
> > +                      */
> > +                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
> >                       .attr_update    = pmu->type->attr_update,
> >               };
>
>
> There is a special customized uncore PMU which needs the flag as well.

Ok, I will add that too.

Btw, during the work I noticed many PMU drivers didn't set the
CAP_NO_INTERRUPT flag even if they didn't support sampling and
rejected the sampling events manually in the ->event_init() callback.

I guess it's because the name of the flag is somewhat misleading.
As the PMU drivers handle IRQ (for overflows), they thought they had
interrupts and didn't set the flag.  I think it'd be better to rename it to
CAP_NO_SAMPLING to reveal the intention.  And then we could just set
the flag in the pmu.capabilities and remove the manual checks.

The benefit is it can skip the PMUs in the timer tick handler even if
it needs to unthrottle some events.  What do you think?

Thanks,
Namhyung

>
> diff --git a/arch/x86/events/intel/uncore_snb.c
> b/arch/x86/events/intel/uncore_snb.c
> index 7fd4334e12a1..46a63e291975 100644
> --- a/arch/x86/events/intel/uncore_snb.c
> +++ b/arch/x86/events/intel/uncore_snb.c
> @@ -1013,7 +1013,7 @@ static struct pmu snb_uncore_imc_pmu = {
>         .start          = uncore_pmu_event_start,
>         .stop           = uncore_pmu_event_stop,
>         .read           = uncore_pmu_event_read,
> -       .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> +       .capabilities   = PERF_PMU_CAP_NO_EXCLUDE |
> PERF_PMU_CAP_NO_INTERRUPT,
>  };
>
>  static struct intel_uncore_ops snb_uncore_imc_ops = {
>
>
> Thanks,
> Kan
> >       } else {

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs
  2023-11-21 18:30     ` Namhyung Kim
@ 2023-11-21 19:26       ` Liang, Kan
  2023-12-01 20:29         ` Namhyung Kim
  0 siblings, 1 reply; 15+ messages in thread
From: Liang, Kan @ 2023-11-21 19:26 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Ian Rogers, Mingwei Zhang



On 2023-11-21 1:30 p.m., Namhyung Kim wrote:
> Hi Kan,
> 
> On Tue, Nov 21, 2023 at 7:59 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>
>>
>>
>> On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
>>> It doesn't support sampling in uncore PMU events.  While it's
>>> technically possible to generate interrupts, let's treat it as if it
>>> has no interrupt in order to skip the freq adjust/unthrottling logic
>>> in the timer handler which is only meaningful to sampling events.
>>>
>>> Also remove the sampling event check because it'd be done in the general
>>> code in the perf_event_open syscall.
>>>
>>> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
>>> ---
>>>  arch/x86/events/intel/uncore.c | 11 ++++++-----
>>>  1 file changed, 6 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
>>> index 69043e02e8a7..f7e6228bd1b1 100644
>>> --- a/arch/x86/events/intel/uncore.c
>>> +++ b/arch/x86/events/intel/uncore.c
>>> @@ -744,10 +744,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
>>>       if (pmu->func_id < 0)
>>>               return -ENOENT;
>>>
>>> -     /* Sampling not supported yet */
>>> -     if (hwc->sample_period)
>>> -             return -EINVAL;
>>> -
>>>       /*
>>>        * Place all uncore events for a particular physical package
>>>        * onto a single cpu
>>> @@ -919,7 +915,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
>>>                       .stop           = uncore_pmu_event_stop,
>>>                       .read           = uncore_pmu_event_read,
>>>                       .module         = THIS_MODULE,
>>> -                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
>>> +                     /*
>>> +                      * It doesn't allow sampling for uncore events, let's
>>> +                      * treat the PMU has no interrupts to skip them in the
>>> +                      * perf_adjust_freq_unthr_context().
>>> +                      */
>>> +                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
>>>                       .attr_update    = pmu->type->attr_update,
>>>               };
>>
>>
>> There is a special customized uncore PMU which needs the flag as well.
> 
> Ok, I will add that too.
> 
> Btw, during the work I noticed many PMU drivers didn't set the
> CAP_NO_INTERRUPT flag even if they didn't support sampling and
> rejected the sampling events manually in the ->event_init() callback.
> 
> I guess it's because the name of the flag is somewhat misleading.
> As the PMU drivers handle IRQ (for overflows), they thought they had
> interrupts and didn't set the flag.  I think it'd be better to rename it to
> CAP_NO_SAMPLING to reveal the intention.  And then we could just set
> the flag in the pmu.capabilities and remove the manual checks.
> 
> The benefit is it can skip the PMUs in the timer tick handler even if
> it needs to unthrottle some events.  What do you think?
> 

I agree. The current name is kind of misleading.

The patch, which introduced the flag (commit id 53b25335dd60 ("perf:
Disable sampled events if no PMU interrupt")), also tried to disable the
sampled events on a no-sampling supported platform.

The renaming sounds good to me.

Thanks,
Kan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-21 18:21     ` Namhyung Kim
@ 2023-11-21 23:01       ` Mingwei Zhang
  2023-11-24  0:50         ` Mingwei Zhang
  0 siblings, 1 reply; 15+ messages in thread
From: Mingwei Zhang @ 2023-11-21 23:01 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Peter Zijlstra, Ingo Molnar, Mark Rutland,
	Alexander Shishkin, Arnaldo Carvalho de Melo, LKML, Kan Liang

On Tue, Nov 21, 2023, Namhyung Kim wrote:

Hi Namhyung,

> Hi Mingwei,
> 
> On Mon, Nov 20, 2023 at 3:24 PM Mingwei Zhang <mizhang@google.com> wrote:
> >
> > On Mon, Nov 20, 2023, Ian Rogers wrote:
> > > On Mon, Nov 20, 2023 at 2:19 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > > >
> > > > It was unnecessarily disabling and enabling PMUs for each event.  It
> > > > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > > > at each PMU.  As pmu context has separate active lists for pinned group
> > > > and flexible group, factor out a new function to do the job.
> > > >
> > > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > > even if it needs to unthrottle sampling events.
> > > >
> > > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > >
> > > Series:
> > > Reviewed-by: Ian Rogers <irogers@google.com>
> > >
> > > Thanks,
> > > Ian
> > >
> >
> > Can we have "Cc: stable@vger.kernel.org" for the whole series? This
> > series should have a great performance improvement for all VMs in which
> > perf sampling events without specifying period.
> 
> I was not sure if it's ok to have this performance fix in the stable series.
> 

A critical performance bug fix is OK to add to the stable tree; the
requirements are listed here:

https://www.kernel.org/doc/Documentation/process/stable-kernel-rules.rst

In particular, this patch satisfies the 2nd sub-bullet of the fourth
bullet.

But let me step back. Only this patch needs the stable tag, not the
whole series. This patch impacts 69 lines of code, so it satisfies
the rule of being within 100 lines (bullet 3).

I will give a try and test it today or tomorrow and make sure we satisfy
bullet 2.

Once it gets in, bullet 1 will be satisfied as well.

Overall, the intention is to improve PMU performance in VM as early as
we can since we don't control the schedule of distro kernel upgrade and
we don't control when end customers upgrade their running kernel. So I
presume even adding to the stable tree may take years to see the result
change. But if we don't do it, it may take way longer (since it does not
contain a "Fixes" tag as well).

Thanks.
-Mingwei

> Thanks,
> Namhyung

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-21 23:01       ` Mingwei Zhang
@ 2023-11-24  0:50         ` Mingwei Zhang
  0 siblings, 0 replies; 15+ messages in thread
From: Mingwei Zhang @ 2023-11-24  0:50 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Peter Zijlstra, Ingo Molnar, Mark Rutland,
	Alexander Shishkin, Arnaldo Carvalho de Melo, LKML, Kan Liang

On Tue, Nov 21, 2023, Mingwei Zhang wrote:
> On Tue, Nov 21, 2023, Namhyung Kim wrote:
> 
> 
> Hi Namhyung,
> 
> > Hi Mingwei,
> > 
> > On Mon, Nov 20, 2023 at 3:24 PM Mingwei Zhang <mizhang@google.com> wrote:
> > >
> > > On Mon, Nov 20, 2023, Ian Rogers wrote:
> > > > On Mon, Nov 20, 2023 at 2:19 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > > > >
> > > > > It was unnecessarily disabling and enabling PMUs for each event.  It
> > > > > should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> > > > > at each PMU.  As pmu context has separate active lists for pinned group
> > > > > and flexible group, factor out a new function to do the job.
> > > > >
> > > > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > > > even if it needs to unthrottle sampling events.
> > > > >
> > > > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > >
> > > > Series:
> > > > Reviewed-by: Ian Rogers <irogers@google.com>
> > > >
> > > > Thanks,
> > > > Ian
> > > >
> > >
> > > Can we have "Cc: stable@vger.kernel.org" for the whole series? This
> > > series should have a great performance improvement for all VMs in which
> > > perf sampling events without specifying period.
> > 
> > I was not sure if it's ok to have this performance fix in the stable series.
> > 
> 
> Critical performance bug fix is ok to be added to stable tree, as the
> requirements are mentioned here:
> 
> https://www.kernel.org/doc/Documentation/process/stable-kernel-rules.rst
> 
> In particular, this patch satisfies the 2nd sub-bullet of the forth
> bullet.
> 
> But let me step back. Only this patch is needed with stable tag instead
> of the whole series. This patch impact 69 lines of code. It satisfies
> the rule of within 100 lines (bullet 3).
> 
> I will give a try and test it today or tomorrow and make sure we satisfy
> bullet 2.
> 
> Once it gets in, bullet 1 will be satisfied as well.
> 
> Overall, the intention is to improve PMU performance in VM as early as
> we can since we don't control the schedule of distro kernel upgrade and
> we don't control when end customers upgrade their running kernel. So I
> presume even adding to the stable tree may take years to see the result
> change. But if we don't do it, it may take way longer (since it does not
> contain a "Fixes" tag as well).
> 
> Thanks.
> -Mingwei
> 

I have tested the code. Yes, profiling results in the VM show that it
removes perf_adjust_freq_unthr_context() as the hot spot. However, when
running perf with enough events in frequency mode to trigger
multiplexing, the overall performance overhead still reaches 60% per
CPU (this overhead is invisible to the vCPU).

At the host level, I have been monitoring the write MSRs and found that
the repeated writes to 0x38f disappeared, indicating that this patch is
indeed working. But on the other hand, I have noticed more frequent
overflows and PMIs.

The more frequent overflows are shown by the writes to MSR 0x390 and
reads of MSR 0x38e. I infer the more frequent PMIs from the much longer
execution of 'vmx_vmexit()' shown in the flamegraph.

Because of the above observation, it seems to me that this patch no
longer satisfies the requirements of Ccing "stable@vger.kernel.org". I
will double check and follow up on this one.

Thanks.
-Mingwei


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs
  2023-11-21 19:26       ` Liang, Kan
@ 2023-12-01 20:29         ` Namhyung Kim
  0 siblings, 0 replies; 15+ messages in thread
From: Namhyung Kim @ 2023-12-01 20:29 UTC (permalink / raw)
  To: Liang, Kan, Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Ian Rogers, Mingwei Zhang

On Tue, Nov 21, 2023 at 11:26 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 2023-11-21 1:30 p.m., Namhyung Kim wrote:
> > Hi Kan,
> >
> > On Tue, Nov 21, 2023 at 7:59 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >>
> >>
> >>
> >> On 2023-11-20 5:19 p.m., Namhyung Kim wrote:
> >>> It doesn't support sampling in uncore PMU events.  While it's
> >>> technically possible to generate interrupts, let's treat it as if it
> >>> has no interrupt in order to skip the freq adjust/unthrottling logic
> >>> in the timer handler which is only meaningful to sampling events.
> >>>
> >>> Also remove the sampling event check because it'd be done in the general
> >>> code in the perf_event_open syscall.
> >>>
> >>> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> >>> ---
> >>>  arch/x86/events/intel/uncore.c | 11 ++++++-----
> >>>  1 file changed, 6 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
> >>> index 69043e02e8a7..f7e6228bd1b1 100644
> >>> --- a/arch/x86/events/intel/uncore.c
> >>> +++ b/arch/x86/events/intel/uncore.c
> >>> @@ -744,10 +744,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
> >>>       if (pmu->func_id < 0)
> >>>               return -ENOENT;
> >>>
> >>> -     /* Sampling not supported yet */
> >>> -     if (hwc->sample_period)
> >>> -             return -EINVAL;
> >>> -
> >>>       /*
> >>>        * Place all uncore events for a particular physical package
> >>>        * onto a single cpu
> >>> @@ -919,7 +915,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
> >>>                       .stop           = uncore_pmu_event_stop,
> >>>                       .read           = uncore_pmu_event_read,
> >>>                       .module         = THIS_MODULE,
> >>> -                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> >>> +                     /*
> >>> +                      * It doesn't allow sampling for uncore events, let's
> >>> +                      * treat the PMU has no interrupts to skip them in the
> >>> +                      * perf_adjust_freq_unthr_context().
> >>> +                      */
> >>> +                     .capabilities   = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
> >>>                       .attr_update    = pmu->type->attr_update,
> >>>               };
> >>
> >>
> >> There is a special customized uncore PMU which needs the flag as well.
> >
> > Ok, I will add that too.
> >
> > Btw, during the work I noticed many PMU drivers didn't set the
> > CAP_NO_INTERRUPT flag even if they didn't support sampling and
> > rejected the sampling events manually in the ->event_init() callback.
> >
> > I guess it's because the name of the flag is somewhat misleading.
> > As the PMU drivers handle IRQ (for overflows), they thought they had
> > interrupts and didn't set the flag.  I think it'd be better to rename it to
> > CAP_NO_SAMPLING to reveal the intention.  And then we could just set
> > the flag in the pmu.capabilities and remove the manual checks.
> >
> > The benefit is it can skip the PMUs in the timer tick handler even if
> > it needs to unthrottle some events.  What do you think?
> >
>
> I agree. The current name is kind of misleading.
>
> The patch, which introduced the flag (commit id 53b25335dd60 ("perf:
> Disable sampled events if no PMU interrupt")), also tried to disable the
> sampled events on a no-sampling supported platform.
>
> The renaming sounds good to me.

Thanks, Kan, for the review.

Peter and Ingo, would you please pick up the first two patches
if you don't have any concerns?  Then I can work on the
renaming on top.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context()
  2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
                   ` (3 preceding siblings ...)
  2023-11-21 15:57 ` Liang, Kan
@ 2023-12-02  6:16 ` Mingwei Zhang
  4 siblings, 0 replies; 15+ messages in thread
From: Mingwei Zhang @ 2023-12-02  6:16 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Ian Rogers, Kan Liang

On Mon, Nov 20, 2023, Namhyung Kim wrote:
> It was unnecessarily disabling and enabling PMUs for each event.  It
> should be done at PMU level.  Add pmu_ctx->nr_freq counter to check it
> at each PMU.  As pmu context has separate active lists for pinned group
> and flexible group, factor out a new function to do the job.
> 
> Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> even if it needs to unthrottle sampling events.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Mingwei Zhang <mizhang@google.com>
> ---
>  include/linux/perf_event.h |  1 +
>  kernel/events/core.c       | 68 +++++++++++++++++++++++---------------
>  2 files changed, 43 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0367d748fae0..3eb17dc89f5e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -879,6 +879,7 @@ struct perf_event_pmu_context {
>  
>  	unsigned int			nr_events;
>  	unsigned int			nr_cgroups;
> +	unsigned int			nr_freq;
>  
>  	atomic_t			refcount; /* event <-> epc */
>  	struct rcu_head			rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 3eb26c2c6e65..53e2ad73102d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2275,8 +2275,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu--;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq--;
> +		epc->nr_freq--;
> +	}
>  	if (event->attr.exclusive || !cpc->active_oncpu)
>  		cpc->exclusive = 0;
>  
> @@ -2531,9 +2533,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>  
>  	if (!is_software_event(event))
>  		cpc->active_oncpu++;
> -	if (event->attr.freq && event->attr.sample_freq)
> +	if (event->attr.freq && event->attr.sample_freq) {
>  		ctx->nr_freq++;
> -
> +		epc->nr_freq++;
> +	}
>  	if (event->attr.exclusive)
>  		cpc->exclusive = 1;
>  
> @@ -4096,30 +4099,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
>  	}
>  }
>  
> -/*
> - * combine freq adjustment with unthrottling to avoid two passes over the
> - * events. At the same time, make sure, having freq events does not change
> - * the rate of unthrottling as that would introduce bias.
> - */
> -static void
> -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
>  {
>  	struct perf_event *event;
>  	struct hw_perf_event *hwc;
>  	u64 now, period = TICK_NSEC;
>  	s64 delta;
>  
> -	/*
> -	 * only need to iterate over all events iff:
> -	 * - context have events in frequency mode (needs freq adjust)
> -	 * - there are events to unthrottle on this cpu
> -	 */
> -	if (!(ctx->nr_freq || unthrottle))
> -		return;
> -
> -	raw_spin_lock(&ctx->lock);
> -
> -	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> +	list_for_each_entry(event, event_list, active_list) {
>  		if (event->state != PERF_EVENT_STATE_ACTIVE)
>  			continue;
>  
> @@ -4127,8 +4114,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		if (!event_filter_match(event))
>  			continue;
>  
> -		perf_pmu_disable(event->pmu);
> -
>  		hwc = &event->hw;
>  
>  		if (hwc->interrupts == MAX_INTERRUPTS) {
> @@ -4138,7 +4123,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  		}
>  
>  		if (!event->attr.freq || !event->attr.sample_freq)
> -			goto next;
> +			continue;
>  
>  		/*
>  		 * stop the event and update event->count
> @@ -4160,8 +4145,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
>  			perf_adjust_period(event, period, delta, false);
>  
>  		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> -	next:
> -		perf_pmu_enable(event->pmu);
> +	}
> +}
> +
> +/*
> + * combine freq adjustment with unthrottling to avoid two passes over the
> + * events. At the same time, make sure, having freq events does not change
> + * the rate of unthrottling as that would introduce bias.
> + */
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	/*
> +	 * only need to iterate over all events iff:
> +	 * - context have events in frequency mode (needs freq adjust)
> +	 * - there are events to unthrottle on this cpu
> +	 */
> +	if (!(ctx->nr_freq || unthrottle))
> +		return;
> +
> +	raw_spin_lock(&ctx->lock);
> +
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (!(pmu_ctx->nr_freq || unthrottle))
> +			continue;
> +		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> +			continue;
> +
> +		perf_pmu_disable(pmu_ctx->pmu);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> +		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> +		perf_pmu_enable(pmu_ctx->pmu);
>  	}
>  
>  	raw_spin_unlock(&ctx->lock);
> -- 
> 2.43.0.rc1.413.gea7ed67945-goog
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-12-02  6:30 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-20 22:19 [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Namhyung Kim
2023-11-20 22:19 ` [PATCH 2/3] perf/core: Reduce PMU access to adjust sample freq Namhyung Kim
2023-11-21 15:57   ` Liang, Kan
2023-11-20 22:19 ` [PATCH 3/3] perf/x86: Add CAP_NO_INTERRUPT for uncore PMUs Namhyung Kim
2023-11-21 15:59   ` Liang, Kan
2023-11-21 18:30     ` Namhyung Kim
2023-11-21 19:26       ` Liang, Kan
2023-12-01 20:29         ` Namhyung Kim
2023-11-20 22:41 ` [PATCH 1/3] perf/core: Update perf_adjust_freq_unthr_context() Ian Rogers
2023-11-20 23:23   ` Mingwei Zhang
2023-11-21 18:21     ` Namhyung Kim
2023-11-21 23:01       ` Mingwei Zhang
2023-11-24  0:50         ` Mingwei Zhang
2023-11-21 15:57 ` Liang, Kan
2023-12-02  6:16 ` Mingwei Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox