* [PATCH 0/5] perf: Per PMU context reschedule and misc
@ 2024-08-07 11:29 Peter Zijlstra
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
` (6 more replies)
0 siblings, 7 replies; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
Hi,
This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
started with me trying to clean up and get rid of corner cases, and then got
involved when Kan noted the time keeping issue.
Anyway, please review / test.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/5] perf: Optimize context reschedule for single PMU cases
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
` (5 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
Currently re-scheduling a context will reschedule all active PMUs for
that context, even if it is known only a single event is added.
Namhyung reported that changing this to only reschedule the affected
PMU when possible provides significant performance gains under certain
conditions.
Therefore, allow partial context reschedules for a specific PMU, that
of the event modified.
While the patch looks somewhat noisy, it mostly just propagates a new
@pmu argument through the callchain and modifies the epc loop to only
pick the 'epc->pmu == @pmu' case.
Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 164 +++++++++++++++++++++++++++------------------------
1 file changed, 88 insertions(+), 76 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do { \
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct ta
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct ta
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct pe
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct p
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx);
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2802,9 +2810,10 @@ static int __perf_install_in_context(vo
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct p
return;
if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct p
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct p
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct p
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct p
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
@@ -3579,7 +3594,7 @@ perf_event_context_sched_out(struct task
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct p
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct p
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(st
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(st
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
} else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(vo
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2/5] perf: Extract a few helpers
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
` (4 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
The context time update code is repeated verbatim a few times.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 39 ++++++++++++++++++++++-----------------
1 file changed, 22 insertions(+), 17 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2330,6 +2330,24 @@ group_sched_out(struct perf_event *group
event_sched_out(event, ctx);
}
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ }
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
@@ -2349,10 +2367,7 @@ __perf_remove_from_context(struct perf_e
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
@@ -2437,12 +2452,8 @@ static void __perf_event_disable(struct
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
@@ -4529,10 +4540,7 @@ static void __perf_event_read(void *info
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4732,10 +4740,7 @@ static int perf_event_read(struct perf_e
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/5] perf: Fix event_function_call() locking
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
` (3 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
All the event_function/@func call context already uses perf_ctx_lock()
except for the !ctx->is_active case. Make it all consistent.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -263,6 +263,7 @@ static int event_function(void *info)
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
struct event_function_struct efs = {
.event = event,
@@ -291,22 +292,22 @@ static void event_function_call(struct p
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
return;
}
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
}
/*
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 4/5] perf: Add context time freeze
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
` (2 preceding siblings ...)
2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
2024-08-07 15:17 ` Liang, Kan
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
` (2 subsequent siblings)
6 siblings, 2 replies; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
Many of the context reschedule users are of the form:
ctx_sched_out(.type = EVENT_TIME);
... modify context
ctx_resched();
With the idea that the whole reschedule happens with a single
time-stamp, rather than with each ctx_sched_out() advancing time and
ctx_sched_in() re-starting time, creating a non-atomic experience.
However, Kan noticed that since this completely stops time, it
actually loses a bit of time between the stop and start. Worse, now
that we can do partial (per PMU) reschedules, the PMUs that are not
scheduled out still observe the time glitch.
Replace this with:
ctx_time_freeze();
... modify context
ctx_resched();
With the assumption that this happens in a perf_ctx_lock() /
perf_ctx_unlock() pair.
The new ctx_time_freeze() will update time and sets EVENT_FROZEN, and
ensures EVENT_TIME and EVENT_FROZEN remain set, this avoids
perf_event_time_now() from observing a time wobble from not seeing
EVENT_TIME for a little while.
Additionally, this avoids losing time between
ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
timestamp.
Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 128 ++++++++++++++++++++++++++++++++++-----------------
1 file changed, 86 insertions(+), 42 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
@@ -370,16 +405,6 @@ static void event_function_local(struct
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
@@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
}
static inline void
-ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
{
if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
}
}
static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
{
if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
@@ -2822,7 +2868,7 @@ static int __perf_install_in_context(vo
#endif
if (reprogram) {
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
get_event_type(event));
@@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
@@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
@@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, NULL, event_type);
- } else {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out()
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
` (3 preceding siblings ...)
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
2024-08-07 18:54 ` Namhyung Kim
6 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
To: mingo
Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel
There is no point in doing the perf_pmu_disable() dance just to do
nothing. This happens for ctx_sched_out(.type = EVENT_TIME) for
instance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3304,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 4/5] perf: Add context time freeze
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
@ 2024-08-07 15:17 ` Liang, Kan
2024-08-07 19:09 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 20+ messages in thread
From: Liang, Kan @ 2024-08-07 15:17 UTC (permalink / raw)
To: Peter Zijlstra, mingo
Cc: acme, namhyung, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel
On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Many of the the context reschedule users are of the form:
>
> ctx_sched_out(.type = EVENT_TIME);
> ... modify context
> ctx_resched();
>
> With the idea that the whole reschedule happens with a single
> time-stamp, rather than with each ctx_sched_out() advancing time and
> ctx_sched_in() re-starting time, creating a non-atomic experience.
>
> However, Kan noticed that since this completely stops time, it
> actually loses a bit of time between the stop and start. Worse, now
> that we can do partial (per PMU) reschedules, the PMUs that are not
> scheduled out still observe the time glitch.
>
> Replace this with:
>
> ctx_time_freeze();
> ... modify context
> ctx_resched();
>
> With the assumption that this happens in a perf_ctx_lock() /
> perf_ctx_unlock() pair.
>
> The new ctx_time_freeze() will update time and sets EVENT_FROZEN, and
> ensures EVENT_TIME and EVENT_FROZEN remain set, this avoids
> perf_event_time_now() from observing a time wobble from not seeing
> EVENT_TIME for a little while.
>
> Additionally, this avoids losing time between
> ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
> timestamp.
>
> Reported-by: Kan Liang <kan.liang@linux.intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/events/core.c | 128 ++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 86 insertions(+), 42 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
> return data.ret;
> }
>
> +enum event_type_t {
> + EVENT_FLEXIBLE = 0x01,
> + EVENT_PINNED = 0x02,
> + EVENT_TIME = 0x04,
> + EVENT_FROZEN = 0x08,
> + /* see ctx_resched() for details */
> + EVENT_CPU = 0x10,
> + EVENT_CGROUP = 0x20,
> +
> + /* compound helpers */
> + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
> +};
> +
> +static inline void __perf_ctx_lock(struct perf_event_context *ctx)
> +{
> + raw_spin_lock(&ctx->lock);
> + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
> +}
> +
> static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> - raw_spin_lock(&cpuctx->ctx.lock);
> + __perf_ctx_lock(&cpuctx->ctx);
> if (ctx)
> - raw_spin_lock(&ctx->lock);
> + __perf_ctx_lock(ctx);
> +}
> +
> +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> +{
> + /*
> + * If ctx_sched_in() didn't again set any ALL flags, clean up
> + * after ctx_sched_out() by clearing is_active.
> + */
> + if (ctx->is_active & EVENT_FROZEN) {
> + if (!(ctx->is_active & EVENT_ALL))
Nit:
It may be better to add a macro/inline function to replace all the
(ctx->is_active & EVENT_ALL) check? For example,
+static inline bool perf_ctx_has_active_events(struct perf_event_context
*ctx)
+{
+ return ctx->is_active & EVENT_ALL;
+}
...
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!perf_ctx_has_active_events(ctx))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
It can tell very straightforwardly that we want to clear all flags if
there is no active event.
The EVENT_ALL may bring confusion. It actually means all events, not all
event types. The developer may have to go to the define and figure out
what exactly the EVENT_ALL includes.
Thanks,
Kan
> + ctx->is_active = 0;
> + else
> + ctx->is_active &= ~EVENT_FROZEN;
> + }
> + raw_spin_unlock(&ctx->lock);
> }
>
> static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> if (ctx)
> - raw_spin_unlock(&ctx->lock);
> - raw_spin_unlock(&cpuctx->ctx.lock);
> + __perf_ctx_unlock(ctx);
> + __perf_ctx_unlock(&cpuctx->ctx);
> }
>
> #define TASK_TOMBSTONE ((void *)-1L)
> @@ -370,16 +405,6 @@ static void event_function_local(struct
> (PERF_SAMPLE_BRANCH_KERNEL |\
> PERF_SAMPLE_BRANCH_HV)
>
> -enum event_type_t {
> - EVENT_FLEXIBLE = 0x1,
> - EVENT_PINNED = 0x2,
> - EVENT_TIME = 0x4,
> - /* see ctx_resched() for details */
> - EVENT_CPU = 0x8,
> - EVENT_CGROUP = 0x10,
> - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> -};
> -
> /*
> * perf_sched_events : >0 events exist
> */
> @@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
> }
>
> static inline void
> -ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, false);
> + update_cgrp_time_from_cpuctx(cpuctx, final);
> }
> }
>
> static inline void
> +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + __ctx_time_update(cpuctx, ctx, false);
> +}
> +
> +/*
> + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
> + */
> +static inline void
> +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + ctx_time_update(cpuctx, ctx);
> + if (ctx->is_active & EVENT_TIME)
> + ctx->is_active |= EVENT_FROZEN;
> +}
> +
> +static inline void
> ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> update_cgrp_time_from_event(event);
> }
> @@ -2822,7 +2868,7 @@ static int __perf_install_in_context(vo
> #endif
>
> if (reprogram) {
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
> add_event_to_ctx(event, ctx);
> ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> get_event_type(event));
> @@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
> event->state <= PERF_EVENT_STATE_ERROR)
> return;
>
> - if (ctx->is_active)
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
> perf_cgroup_event_enable(event, ctx);
> @@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
> if (!ctx->is_active)
> return;
>
> - if (!event_filter_match(event)) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (!event_filter_match(event))
> return;
> - }
>
> /*
> * If the event is in a group and isn't the group leader,
> * then don't put it on unless the group is on.
> */
> - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
> return;
> - }
>
> task_ctx = cpuctx->task_ctx;
> if (ctx->task)
> @@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> - if (ctx->task && !ctx->is_active) {
> + if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
> struct perf_cpu_pmu_context *cpc;
>
> cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> @@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
> *
> * would only update time for the pinned events.
> */
> - if (is_active & EVENT_TIME) {
> - /* update (and stop) ctx time */
> - update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
> +
> + /*
> + * CPU-release for the below ->is_active store,
> + * see __load_acquire() in perf_event_time_now()
> + */
> + barrier();
> + ctx->is_active &= ~event_type;
> +
> + if (!(ctx->is_active & EVENT_ALL)) {
> /*
> - * CPU-release for the below ->is_active store,
> - * see __load_acquire() in perf_event_time_now()
> + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
> + * does not observe a hole. perf_ctx_unlock() will clean up.
> */
> - barrier();
> + if (ctx->is_active & EVENT_FROZEN)
> + ctx->is_active &= EVENT_TIME_FROZEN;
> + else
> + ctx->is_active = 0;
> }
>
> - ctx->is_active &= ~event_type;
> - if (!(ctx->is_active & EVENT_ALL))
> - ctx->is_active = 0;
> -
> if (ctx->task) {
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> - if (!ctx->is_active)
> + if (!(ctx->is_active & EVENT_ALL))
> cpuctx->task_ctx = NULL;
> }
>
> @@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
>
> ctx->is_active |= (event_type | EVENT_TIME);
> if (ctx->task) {
> - if (!is_active)
> + if (!(is_active & EVENT_ALL))
> cpuctx->task_ctx = ctx;
> else
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> @@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
>
> cpuctx = this_cpu_ptr(&perf_cpu_context);
> perf_ctx_lock(cpuctx, ctx);
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> list_for_each_entry(event, &ctx->event_list, event_entry) {
> enabled |= event_enable_on_exec(event, ctx);
> @@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> ctx_resched(cpuctx, ctx, NULL, event_type);
> - } else {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> }
> perf_ctx_unlock(cpuctx, ctx);
>
>
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/5] perf: Per PMU context reschedule and misc
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
` (4 preceding siblings ...)
2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
@ 2024-08-07 15:19 ` Liang, Kan
2024-08-07 18:54 ` Namhyung Kim
6 siblings, 0 replies; 20+ messages in thread
From: Liang, Kan @ 2024-08-07 15:19 UTC (permalink / raw)
To: Peter Zijlstra, mingo
Cc: acme, namhyung, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, linux-perf-users, linux-kernel
On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Hi,
>
> This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
> started with me trying to clean up and get rid of corner cases, and then got
> involved when Kan noted the time keeping issue.
>
> Anyway, please review / test.
>
Except for the tiny nit, the patch series looks good to me.
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Thanks,
Kan
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 0/5] perf: Per PMU context reschedule and misc
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
` (5 preceding siblings ...)
2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
@ 2024-08-07 18:54 ` Namhyung Kim
6 siblings, 0 replies; 20+ messages in thread
From: Namhyung Kim @ 2024-08-07 18:54 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, irogers,
adrian.hunter, kan.liang, linux-perf-users, linux-kernel
Hi Peter,
On Wed, Aug 7, 2024 at 4:56 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Hi,
>
> This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
> started with me trying to clean up and get rid of corner cases, and then got
> involved when Kan noted the time keeping issue.
>
> Anyway, please review / test.
It works blazingly fast!
# ./stress-pmu
delta: 0.000307 sec (3 usec/op)
I found a problem with my patch that it called __pmu_ctx_sched_out() for
nothing (I guess is_active only has EVENT_TIME). I thought ctx_sched_out()
would stop if it doesn't change EVENT_ALL but it iterated all PMUs anyway.
But with this change we don't need ctx_sched_out(EVENT_TIME) anymore.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Thanks,
Namhyung
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 4/5] perf: Add context time freeze
2024-08-07 15:17 ` Liang, Kan
@ 2024-08-07 19:09 ` Peter Zijlstra
0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-07 19:09 UTC (permalink / raw)
To: Liang, Kan
Cc: mingo, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, linux-perf-users, linux-kernel
On Wed, Aug 07, 2024 at 11:17:18AM -0400, Liang, Kan wrote:
> > +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> > +{
> > + /*
> > + * If ctx_sched_in() didn't again set any ALL flags, clean up
> > + * after ctx_sched_out() by clearing is_active.
> > + */
> > + if (ctx->is_active & EVENT_FROZEN) {
> > + if (!(ctx->is_active & EVENT_ALL))
>
> Nit:
> It may be better to add a macro/inline function to replace all the
> (ctx->is_active & EVENT_ALL) check? For example,
>
> +static inline bool perf_ctx_has_active_events(struct perf_event_context
> *ctx)
> +{
> + return ctx->is_active & EVENT_ALL;
> +}
> ...
> + if (ctx->is_active & EVENT_FROZEN) {
> + if (!perf_ctx_has_active_events(ctx))
> + ctx->is_active = 0;
> + else
> + ctx->is_active &= ~EVENT_FROZEN;
>
> It can tell very straightforwardly that we want to clear all flags if
> there is no active event.
> The EVENT_ALL may bring confusion. It actually means all events, not all
> event types. The developer may have to go to the define and figure out
> what exactly the EVENT_ALL includes.
I'll push this on the todo list. I'm running short of time before I'm
taking a holiday and need to also spend time looking at the sched_ext
thing.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [tip: perf/core] perf: Optimize __pmu_ctx_sched_out()
2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
@ 2024-08-08 10:32 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 20+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-08-08 10:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Kan Liang, Namhyung Kim, x86,
linux-kernel
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 3e15a3fe3a2a170c5be52783667706875c088f96
Gitweb: https://git.kernel.org/tip/3e15a3fe3a2a170c5be52783667706875c088f96
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 07 Aug 2024 13:29:29 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 08 Aug 2024 12:27:32 +02:00
perf: Optimize __pmu_ctx_sched_out()
There is no point in doing the perf_pmu_disable() dance just to do
nothing. This happens for ctx_sched_out(.type = EVENT_TIME) for
instance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115550.392851915@infradead.org
---
kernel/events/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 197d3be..9893ba5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3312,7 +3312,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [tip: perf/core] perf: Add context time freeze
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
2024-08-07 15:17 ` Liang, Kan
@ 2024-08-08 10:32 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 20+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-08-08 10:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: Kan Liang, Peter Zijlstra (Intel), Namhyung Kim, x86,
linux-kernel
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 5d95a2af973d47260b1e1828953fc860c0094052
Gitweb: https://git.kernel.org/tip/5d95a2af973d47260b1e1828953fc860c0094052
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 07 Aug 2024 13:29:28 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 08 Aug 2024 12:27:32 +02:00
perf: Add context time freeze
Many of the context reschedule users are of the form:
ctx_sched_out(.type = EVENT_TIME);
... modify context
ctx_resched();
With the idea that the whole reschedule happens with a single
time-stamp, rather than with each ctx_sched_out() advancing time and
ctx_sched_in() re-starting time, creating a non-atomic experience.
However, Kan noticed that since this completely stops time, it
actually loses a bit of time between the stop and start. Worse, now
that we can do partial (per PMU) reschedules, the PMUs that are not
scheduled out still observe the time glitch.
Replace this with:
ctx_time_freeze();
... modify context
ctx_resched();
With the assumption that this happens in a perf_ctx_lock() /
perf_ctx_unlock() pair.
The new ctx_time_freeze() will update time and sets EVENT_FROZEN, and
ensures EVENT_TIME and EVENT_FROZEN remain set, this avoids
perf_event_time_now() from observing a time wobble from not seeing
EVENT_TIME for a little while.
Additionally, this avoids losing time between
ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
timestamp.
Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115550.250637571@infradead.org
---
kernel/events/core.c | 128 ++++++++++++++++++++++++++++--------------
1 file changed, 86 insertions(+), 42 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ab49dea..197d3be 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
@@ -370,16 +405,6 @@ unlock:
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
@@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
}
static inline void
-ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
{
if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
}
}
static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
{
if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
@@ -2822,7 +2868,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
get_event_type(event));
@@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
@@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
@@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, NULL, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, NULL, event_type);
- } else {
- ctx_sched_in(ctx, NULL, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [tip: perf/core] perf: Fix event_function_call() locking
2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
@ 2024-08-08 10:32 ` tip-bot2 for Peter Zijlstra
2024-08-13 1:34 ` Pengfei Xu
0 siblings, 1 reply; 20+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-08-08 10:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Kan Liang, Namhyung Kim, x86,
linux-kernel
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 558abc7e3f895049faa46b08656be4c60dc6e9fd
Gitweb: https://git.kernel.org/tip/558abc7e3f895049faa46b08656be4c60dc6e9fd
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 07 Aug 2024 13:29:27 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 08 Aug 2024 12:27:31 +02:00
perf: Fix event_function_call() locking
All the event_function/@func call context already uses perf_ctx_lock()
except for the !ctx->is_active case. Make it all consistent.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115550.138301094@infradead.org
---
kernel/events/core.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eb03c9a..ab49dea 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -263,6 +263,7 @@ unlock:
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
struct event_function_struct efs = {
.event = event,
@@ -291,22 +292,22 @@ again:
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
return;
}
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
}
/*
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [tip: perf/core] perf: Extract a few helpers
2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
@ 2024-08-08 10:32 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 20+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-08-08 10:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Kan Liang, Namhyung Kim, x86,
linux-kernel
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 9a32bd9901fe5b1dcf544389dbf04f3b0a2fbab4
Gitweb: https://git.kernel.org/tip/9a32bd9901fe5b1dcf544389dbf04f3b0a2fbab4
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 07 Aug 2024 13:29:26 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 08 Aug 2024 12:27:31 +02:00
perf: Extract a few helpers
The context time update code is repeated verbatim a few times.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115550.031212518@infradead.org
---
kernel/events/core.c | 39 ++++++++++++++++++++++-----------------
1 file changed, 22 insertions(+), 17 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dad2b9a..eb03c9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2330,6 +2330,24 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
event_sched_out(event, ctx);
}
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ }
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
@@ -2349,10 +2367,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
@@ -2437,12 +2452,8 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
@@ -4529,10 +4540,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4732,10 +4740,7 @@ again:
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [tip: perf/core] perf: Optimize context reschedule for single PMU cases
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
@ 2024-08-08 10:32 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 20+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-08-08 10:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: Namhyung Kim, Peter Zijlstra (Intel), Kan Liang, x86,
linux-kernel
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 2d17cf1abcbe8a45b7dc41a768ed22aac158ddd8
Gitweb: https://git.kernel.org/tip/2d17cf1abcbe8a45b7dc41a768ed22aac158ddd8
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 07 Aug 2024 13:29:25 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 08 Aug 2024 12:27:31 +02:00
perf: Optimize context reschedule for single PMU cases
Currently re-scheduling a context will reschedule all active PMUs for
that context, even if it is known only a single event is added.
Namhyung reported that changing this to only reschedule the affected
PMU when possible provides significant performance gains under certain
conditions.
Therefore, allow partial context reschedules for a specific PMU, that
of the event modified.
While the patch looks somewhat noisy, it mostly just propagates a new
@pmu argument through the callchain and modifies the epc loop to only
pick the 'epc->pmu == @pmu' case.
Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115549.920950699@infradead.org
---
kernel/events/core.c | 164 ++++++++++++++++++++++--------------------
1 file changed, 88 insertions(+), 76 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c01a326..dad2b9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do { \
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_event_sched_in(cpuctx, task_ctx);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2802,9 +2810,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
@@ -3579,7 +3594,7 @@ unlock:
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
} else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_sched_in(ctx, NULL, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [tip: perf/core] perf: Fix event_function_call() locking
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
@ 2024-08-13 1:34 ` Pengfei Xu
2024-08-13 15:19 ` Naresh Kamboju
0 siblings, 1 reply; 20+ messages in thread
From: Pengfei Xu @ 2024-08-13 1:34 UTC (permalink / raw)
To: linux-kernel
Cc: linux-tip-commits, Peter Zijlstra (Intel), Kan Liang,
Namhyung Kim, x86, syzkaller-bugs
Hi Peter and perf experts,
There is "BUG: using smp_processor_id() in preemptible code in
event_function_call" in v6.11-rc3 Intel internal with linux-next patches in
syzkaller testing.
And this issue was reproduced in next-20240812 also.
Bisected and found related commit in Intel internal repo:
558abc7e3f89 perf: Fix event_function_call() locking
After reverting the above commit on top of the Intel internal kernel, this
issue was gone.
All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240812_194918_event_function_call
Syzkaller repro code: https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/repro.c
Syzkaller repro syscall steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/repro.prog
Syzkaller analysis report: https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/repro.report
Kconfig(make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/kconfig_origin
Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/bisect_info.log
Issue bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240812_194918_event_function_call/bzImage_dcdef334e8852ca433e908db67744965101f3fc1.tar.gz
Issue dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240812_194918_event_function_call/dcdef334e8852ca433e908db67744965101f3fc1_dmesg.log
"
[ 17.071069] BUG: using smp_processor_id() in preemptible [00000000] code: repro/726
[ 17.071875] caller is debug_smp_processor_id+0x20/0x30
[ 17.072369] CPU: 0 UID: 0 PID: 726 Comm: repro Not tainted 6.11.0-rc3-next-20240812-dcdef334e885+ #1
[ 17.073225] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.074205] Call Trace:
[ 17.074436] <TASK>
[ 17.074646] dump_stack_lvl+0x121/0x150
[ 17.075005] dump_stack+0x19/0x20
[ 17.075313] check_preemption_disabled+0x168/0x180
[ 17.075753] debug_smp_processor_id+0x20/0x30
[ 17.076156] event_function_call+0xd7/0x5c0
[ 17.076520] ? __pfx___perf_remove_from_context+0x10/0x10
[ 17.076986] ? __this_cpu_preempt_check+0x21/0x30
[ 17.077404] ? __pfx_event_function_call+0x10/0x10
[ 17.077850] ? __this_cpu_preempt_check+0x21/0x30
[ 17.078276] ? _raw_spin_unlock_irq+0x2c/0x60
[ 17.078681] ? lockdep_hardirqs_on+0x89/0x110
[ 17.079084] perf_remove_from_context+0xf7/0x1d0
[ 17.079506] perf_event_release_kernel+0x186/0x870
[ 17.079944] ? lock_is_held_type+0xef/0x150
[ 17.080326] ? locks_remove_file+0x3bb/0x5e0
[ 17.080727] ? __pfx_perf_event_release_kernel+0x10/0x10
[ 17.081202] ? __this_cpu_preempt_check+0x21/0x30
[ 17.081634] ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30
[ 17.082116] ? evm_file_release+0x193/0x1f0
[ 17.082502] ? __pfx_perf_release+0x10/0x10
[ 17.082880] perf_release+0x40/0x60
[ 17.083203] __fput+0x426/0xbc0
[ 17.083509] ____fput+0x1f/0x30
[ 17.083806] task_work_run+0x19c/0x2b0
[ 17.084159] ? __pfx_task_work_run+0x10/0x10
[ 17.084553] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[ 17.085036] ? switch_task_namespaces+0xc6/0x110
[ 17.085463] do_exit+0xb19/0x2aa0
[ 17.085778] ? lock_release+0x441/0x870
[ 17.086138] ? __pfx_do_exit+0x10/0x10
[ 17.086485] ? __this_cpu_preempt_check+0x21/0x30
[ 17.087007] ? _raw_spin_unlock_irq+0x2c/0x60
[ 17.087219] ? lockdep_hardirqs_on+0x89/0x110
[ 17.087426] do_group_exit+0xe4/0x2c0
[ 17.087607] __x64_sys_exit_group+0x4d/0x60
[ 17.087799] x64_sys_call+0x20c4/0x20d0
[ 17.087974] do_syscall_64+0x6d/0x140
[ 17.088144] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 17.088374] RIP: 0033:0x7f6c5a518a4d
[ 17.088541] Code: Unable to access opcode bytes at 0x7f6c5a518a23.
[ 17.088832] RSP: 002b:00007fffca44fe68 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[ 17.089166] RAX: ffffffffffffffda RBX: 00007f6c5a5f69e0 RCX: 00007f6c5a518a4d
[ 17.089476] RDX: 00000000000000e7 RSI: ffffffffffffff80 RDI: 0000000000000000
[ 17.089790] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000020
[ 17.090100] R10: 00007fffca44fd10 R11: 0000000000000246 R12: 00007f6c5a5f69e0
[ 17.090411] R13: 00007f6c5a5fbf00 R14: 0000000000000001 R15: 00007f6c5a5fbee8
[ 17.090740] </TASK>
[ 17.162307] BUG: using smp_processor_id() in preemptible [00000000] code: repro/727
[ 17.163284] caller is debug_smp_processor_id+0x20/0x30
[ 17.163765] CPU: 0 UID: 0 PID: 727 Comm: repro Not tainted 6.11.0-rc3-next-20240812-dcdef334e885+ #1
[ 17.164571] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.165606] Call Trace:
[ 17.165840] <TASK>
[ 17.166045] dump_stack_lvl+0x121/0x150
[ 17.166407] dump_stack+0x19/0x20
[ 17.166732] check_preemption_disabled+0x168/0x180
[ 17.166944] debug_smp_processor_id+0x20/0x30
[ 17.167153] event_function_call+0xd7/0x5c0
[ 17.167355] ? __pfx___perf_remove_from_context+0x10/0x10
[ 17.167611] ? __this_cpu_preempt_check+0x21/0x30
[ 17.167835] ? __pfx_event_function_call+0x10/0x10
[ 17.168072] ? __this_cpu_preempt_check+0x21/0x30
[ 17.168297] ? _raw_spin_unlock_irq+0x2c/0x60
[ 17.168505] ? lockdep_hardirqs_on+0x89/0x110
[ 17.168718] perf_remove_from_context+0xf7/0x1d0
[ 17.168939] perf_event_release_kernel+0x186/0x870
[ 17.169164] ? lock_is_held_type+0xef/0x150
[ 17.169362] ? locks_remove_file+0x3bb/0x5e0
[ 17.169546] ? __pfx_perf_event_release_kernel+0x10/0x10
[ 17.169791] ? __this_cpu_preempt_check+0x21/0x30
[ 17.169994] ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30
[ 17.170219] ? evm_file_release+0x193/0x1f0
[ 17.170421] ? __pfx_perf_release+0x10/0x10
[ 17.170620] perf_release+0x40/0x60
[ 17.170790] __fput+0x426/0xbc0
[ 17.170938] ____fput+0x1f/0x30
[ 17.171082] task_work_run+0x19c/0x2b0
[ 17.171271] ? __pfx_task_work_run+0x10/0x10
[ 17.171478] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[ 17.171732] ? switch_task_namespaces+0xc6/0x110
[ 17.171957] do_exit+0xb19/0x2aa0
[ 17.172122] ? lock_release+0x441/0x870
[ 17.172314] ? __pfx_do_exit+0x10/0x10
[ 17.172476] ? __this_cpu_preempt_check+0x21/0x30
[ 17.172700] ? _raw_spin_unlock_irq+0x2c/0x60
[ 17.172910] ? lockdep_hardirqs_on+0x89/0x110
[ 17.173126] do_group_exit+0xe4/0x2c0
[ 17.173307] __x64_sys_exit_group+0x4d/0x60
[ 17.173509] x64_sys_call+0x20c4/0x20d0
[ 17.173679] do_syscall_64+0x6d/0x140
[ 17.173854] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 17.174091] RIP: 0033:0x7fc803b18a4d
[ 17.174245] Code: Unable to access opcode bytes at 0x7fc803b18a23.
[ 17.174522] RSP: 002b:00007ffd949207a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[ 17.174892] RAX: ffffffffffffffda RBX: 00007fc803bf69e0 RCX: 00007fc803b18a4d
[ 17.175208] RDX: 00000000000000e7 RSI: ffffffffffffff80 RDI: 0000000000000000
[ 17.175526] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000020
[ 17.175849] R10: 00007ffd94920650 R11: 0000000000000246 R12: 00007fc803bf69e0
[ 17.176170] R13: 00007fc803bfbf00 R14: 0000000000000001 R15: 00007fc803bfbee8
[ 17.176508] </TASK>
"
I hope it's helpful.
Thanks!
---
If you don't need the following environment to reproduce the problem, or if you
already have a reproduction environment, please ignore the following information.
How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You could change the bzImage_xxx as you want
// Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost
After logging in to the vm (virtual machine) successfully, you could transfer the
reproducer binary to the vm in the way below, and reproduce the problem in the vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/
Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage // x should be equal to or less than the number of CPUs your PC has
Fill the bzImage file into above start3.sh to load the target kernel in vm.
Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install
Best Regards,
Thanks!
On 2024-08-08 at 10:32:08 -0000, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the perf/core branch of tip:
>
> Commit-ID: 558abc7e3f895049faa46b08656be4c60dc6e9fd
> Gitweb: https://git.kernel.org/tip/558abc7e3f895049faa46b08656be4c60dc6e9fd
> Author: Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Wed, 07 Aug 2024 13:29:27 +02:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Thu, 08 Aug 2024 12:27:31 +02:00
>
> perf: Fix event_function_call() locking
>
> All the event_function/@func call context already uses perf_ctx_lock()
> except for the !ctx->is_active case. Make it all consistent.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
> Reviewed-by: Namhyung Kim <namhyung@kernel.org>
> Link: https://lore.kernel.org/r/20240807115550.138301094@infradead.org
> ---
> kernel/events/core.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index eb03c9a..ab49dea 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -263,6 +263,7 @@ unlock:
> static void event_function_call(struct perf_event *event, event_f func, void *data)
> {
> struct perf_event_context *ctx = event->ctx;
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
> struct event_function_struct efs = {
> .event = event,
> @@ -291,22 +292,22 @@ again:
> if (!task_function_call(task, event_function, &efs))
> return;
>
> - raw_spin_lock_irq(&ctx->lock);
> + perf_ctx_lock(cpuctx, ctx);
> /*
> * Reload the task pointer, it might have been changed by
> * a concurrent perf_event_context_sched_out().
> */
> task = ctx->task;
> if (task == TASK_TOMBSTONE) {
> - raw_spin_unlock_irq(&ctx->lock);
> + perf_ctx_unlock(cpuctx, ctx);
> return;
> }
> if (ctx->is_active) {
> - raw_spin_unlock_irq(&ctx->lock);
> + perf_ctx_unlock(cpuctx, ctx);
> goto again;
> }
> func(event, NULL, ctx, data);
> - raw_spin_unlock_irq(&ctx->lock);
> + perf_ctx_unlock(cpuctx, ctx);
> }
>
> /*
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [tip: perf/core] perf: Fix event_function_call() locking
2024-08-13 1:34 ` Pengfei Xu
@ 2024-08-13 15:19 ` Naresh Kamboju
2024-08-13 18:28 ` Namhyung Kim
0 siblings, 1 reply; 20+ messages in thread
From: Naresh Kamboju @ 2024-08-13 15:19 UTC (permalink / raw)
To: pengfei.xu
Cc: kan.liang, linux-kernel, linux-tip-commits, namhyung, peterz,
syzkaller-bugs, x86, lkft-triage, dan.carpenter, anders.roxell,
arnd, Linux Kernel Functional Testing
While running the LTP test cases splice07 and perf_event_open01, we found the
following kernel BUG on the arm64 device juno-r2 and on qemu-arm64 on the Linux
next-20240812 and next-20240813 tags.
GOOD: next-20240809
BAD: next-20240812
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Test log:
--------
[ 2278.760258] check_preemption_disabled: 15 callbacks suppressed
[ 2278.760282] BUG: using smp_processor_id() in preemptible [00000000] code: perf_event_open/111076
[ 2278.775032] caller is debug_smp_processor_id+0x20/0x30
[ 2278.780270] CPU: 5 UID: 0 PID: 111076 Comm: perf_event_open Not tainted 6.11.0-rc3-next-20240812 #1
[ 2278.789344] Hardware name: ARM Juno development board (r2) (DT)
[ 2278.795276] Call trace:
[ 2278.797724] dump_backtrace+0x9c/0x128
[ 2278.801487] show_stack+0x20/0x38
[ 2278.804812] dump_stack_lvl+0xbc/0xd0
[ 2278.808487] dump_stack+0x18/0x28
[ 2278.811811] check_preemption_disabled+0xd8/0xf8
[ 2278.816446] debug_smp_processor_id+0x20/0x30
[ 2278.820818] event_function_call+0x54/0x168
[ 2278.825015] _perf_event_enable+0x78/0xa8
[ 2278.829037] perf_event_for_each_child+0x44/0xa0
[ 2278.833672] _perf_ioctl+0x1bc/0xae0
[ 2278.837262] perf_ioctl+0x58/0x90
[ 2278.840590] __arm64_sys_ioctl+0xb4/0x100
[ 2278.844615] invoke_syscall+0x50/0x120
[ 2278.848381] el0_svc_common.constprop.0+0x48/0xf0
[ 2278.853103] do_el0_svc+0x24/0x38
[ 2278.856432] el0_svc+0x3c/0x108
[ 2278.859585] el0t_64_sync_handler+0x120/0x130
[ 2278.863956] el0t_64_sync+0x190/0x198
[ 2279.068732] BUG: using smp_processor_id() in preemptible [00000000] code: perf_event_open/111076
[ 2279.077570] caller is debug_smp_processor_id+0x20/0x30
[ 2279.082754] CPU: 1 UID: 0 PID: 111076 Comm: perf_event_open Not tainted 6.11.0-rc3-next-20240812 #1
[ 2279.091823] Hardware name: ARM Juno development board (r2) (DT)
Full test log:
---------
- https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240813/testrun/24833616/suite/log-parser-test/test/check-kernel-bug/log
- https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240813/testrun/24833616/suite/log-parser-test/tests/
- https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240812/testrun/24821160/suite/log-parser-test/test/check-kernel-bug-483bde618da4ec98e33eefb5e26adeb267f80cc2461569605f3166ce12b3fe82/log
metadata:
artifact-location: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/
build-url: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/
build_name: gcc-13-lkftconfig-debug
git_describe: next-20240812
git_repo: https://gitlab.com/Linaro/lkft/mirrors/next/linux-next
git_sha: 9e6869691724b12e1f43655eeedc35fade38120c
kernel-config: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/config
kernel_version: 6.11.0-rc3
toolchain: gcc-13
--
Linaro LKFT
https://lkft.linaro.org
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [tip: perf/core] perf: Fix event_function_call() locking
2024-08-13 15:19 ` Naresh Kamboju
@ 2024-08-13 18:28 ` Namhyung Kim
2024-08-13 21:02 ` Peter Zijlstra
0 siblings, 1 reply; 20+ messages in thread
From: Namhyung Kim @ 2024-08-13 18:28 UTC (permalink / raw)
To: Naresh Kamboju
Cc: pengfei.xu, kan.liang, linux-kernel, linux-tip-commits, peterz,
syzkaller-bugs, x86, lkft-triage, dan.carpenter, anders.roxell,
arnd, Linux Kernel Functional Testing, Andrii Nakryiko
Hello,
On Tue, Aug 13, 2024 at 08:49:59PM +0530, Naresh Kamboju wrote:
> While running LTP test cases splice07 and perf_event_open01 found following
> kernel BUG running on arm64 device juno-r2 and qemu-arm64 on the Linux
> next-20240812 and next-20240813 tag.
>
> GOOD: next-20240809
> BAD: next-20240812
>
> Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
>
> Test log:
> --------
> [ 2278.760258] check_preemption_disabled: 15 callbacks suppressed
> [ 2278.760282] BUG: using smp_processor_id() in preemptible [00000000] code: perf_event_open/111076
> [ 2278.775032] caller is debug_smp_processor_id+0x20/0x30
> [ 2278.780270] CPU: 5 UID: 0 PID: 111076 Comm: perf_event_open Not tainted 6.11.0-rc3-next-20240812 #1
> [ 2278.789344] Hardware name: ARM Juno development board (r2) (DT)
> [ 2278.795276] Call trace:
> [ 2278.797724] dump_backtrace+0x9c/0x128
> [ 2278.801487] show_stack+0x20/0x38
> [ 2278.804812] dump_stack_lvl+0xbc/0xd0
> [ 2278.808487] dump_stack+0x18/0x28
> [ 2278.811811] check_preemption_disabled+0xd8/0xf8
> [ 2278.816446] debug_smp_processor_id+0x20/0x30
> [ 2278.820818] event_function_call+0x54/0x168
> [ 2278.825015] _perf_event_enable+0x78/0xa8
> [ 2278.829037] perf_event_for_each_child+0x44/0xa0
> [ 2278.833672] _perf_ioctl+0x1bc/0xae0
> [ 2278.837262] perf_ioctl+0x58/0x90
> [ 2278.840590] __arm64_sys_ioctl+0xb4/0x100
> [ 2278.844615] invoke_syscall+0x50/0x120
> [ 2278.848381] el0_svc_common.constprop.0+0x48/0xf0
> [ 2278.853103] do_el0_svc+0x24/0x38
> [ 2278.856432] el0_svc+0x3c/0x108
> [ 2278.859585] el0t_64_sync_handler+0x120/0x130
> [ 2278.863956] el0t_64_sync+0x190/0x198
> [ 2279.068732] BUG: using smp_processor_id() in preemptible [00000000] code: perf_event_open/111076
> [ 2279.077570] caller is debug_smp_processor_id+0x20/0x30
> [ 2279.082754] CPU: 1 UID: 0 PID: 111076 Comm: perf_event_open Not tainted 6.11.0-rc3-next-20240812 #1
> [ 2279.091823] Hardware name: ARM Juno development board (r2) (DT)
>
> Full test log:
> ---------
> - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240813/testrun/24833616/suite/log-parser-test/test/check-kernel-bug/log
> - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240813/testrun/24833616/suite/log-parser-test/tests/
> - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20240812/testrun/24821160/suite/log-parser-test/test/check-kernel-bug-483bde618da4ec98e33eefb5e26adeb267f80cc2461569605f3166ce12b3fe82/log
>
> metadata:
> artifact-location: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/
> build-url: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/
> build_name: gcc-13-lkftconfig-debug
> git_describe: next-20240812
> git_repo: https://gitlab.com/Linaro/lkft/mirrors/next/linux-next
> git_sha: 9e6869691724b12e1f43655eeedc35fade38120c
> kernel-config: https://storage.tuxsuite.com/public/linaro/lkft/builds/2kXsz6nJO7pJ1nL4xGlKHYhiLx9/config
> kernel_version: 6.11.0-rc3
> toolchain: gcc-13
Thanks for the report. Can you please check if the patch below solves the problem?
Thanks,
Namhyung
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9893ba5e98aa..85204c2376fa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -298,13 +298,14 @@ static int event_function(void *info)
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_context *cpuctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
struct event_function_struct efs = {
.event = event,
.func = func,
.data = data,
};
+ unsigned long flags;
if (!event->parent) {
/*
@@ -327,22 +328,27 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
if (!task_function_call(task, event_function, &efs))
return;
+ local_irq_save(flags);
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+
perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- perf_ctx_unlock(cpuctx, ctx);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto out;
+
if (ctx->is_active) {
perf_ctx_unlock(cpuctx, ctx);
+ local_irq_restore(flags);
goto again;
}
func(event, NULL, ctx, data);
+out:
perf_ctx_unlock(cpuctx, ctx);
+ local_irq_restore(flags);
}
/*
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [tip: perf/core] perf: Fix event_function_call() locking
2024-08-13 18:28 ` Namhyung Kim
@ 2024-08-13 21:02 ` Peter Zijlstra
2024-08-14 2:35 ` Pengfei Xu
0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2024-08-13 21:02 UTC (permalink / raw)
To: Namhyung Kim
Cc: Naresh Kamboju, pengfei.xu, kan.liang, linux-kernel,
linux-tip-commits, syzkaller-bugs, x86, lkft-triage,
dan.carpenter, anders.roxell, arnd,
Linux Kernel Functional Testing, Andrii Nakryiko
On Tue, Aug 13, 2024 at 11:28:54AM -0700, Namhyung Kim wrote:
Duh, yeah.
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 9893ba5e98aa..85204c2376fa 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -298,13 +298,14 @@ static int event_function(void *info)
> static void event_function_call(struct perf_event *event, event_f func, void *data)
> {
> struct perf_event_context *ctx = event->ctx;
> - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> + struct perf_cpu_context *cpuctx;
> struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
> struct event_function_struct efs = {
> .event = event,
> .func = func,
> .data = data,
> };
> + unsigned long flags;
>
> if (!event->parent) {
> /*
> @@ -327,22 +328,27 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
> if (!task_function_call(task, event_function, &efs))
> return;
>
> + local_irq_save(flags);
This can just be local_irq_disable() though, seeing how the fingered
commit replaced raw_spin_lock_irq().
I'll queue the below...
---
Subject: perf: Really fix event_function_call() locking
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue Aug 13 22:55:11 CEST 2024
Commit 558abc7e3f89 ("perf: Fix event_function_call() locking") lost
IRQ disabling by mistake.
Fixes: 558abc7e3f89 ("perf: Fix event_function_call() locking")
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -298,8 +298,8 @@ static int event_function(void *info)
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
@@ -327,22 +327,25 @@ static void event_function_call(struct p
if (!task_function_call(task, event_function, &efs))
return;
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- perf_ctx_unlock(cpuctx, ctx);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
+unlock:
perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [tip: perf/core] perf: Fix event_function_call() locking
2024-08-13 21:02 ` Peter Zijlstra
@ 2024-08-14 2:35 ` Pengfei Xu
0 siblings, 0 replies; 20+ messages in thread
From: Pengfei Xu @ 2024-08-14 2:35 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Namhyung Kim, Naresh Kamboju, kan.liang, linux-kernel,
linux-tip-commits, syzkaller-bugs, x86, lkft-triage,
dan.carpenter, anders.roxell, arnd,
Linux Kernel Functional Testing, Andrii Nakryiko
Hi Peter and Kim,
I tested this patch on top of 6.11.0-rc3-next-20240812.
This issue cannot be reproduced with the syzkaller reproducer.
Best Regards,
Thanks!
On 2024-08-13 at 23:02:09 +0200, Peter Zijlstra wrote:
> On Tue, Aug 13, 2024 at 11:28:54AM -0700, Namhyung Kim wrote:
>
> Duh, yeah.
>
> > ---
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 9893ba5e98aa..85204c2376fa 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -298,13 +298,14 @@ static int event_function(void *info)
> > static void event_function_call(struct perf_event *event, event_f func, void *data)
> > {
> > struct perf_event_context *ctx = event->ctx;
> > - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> > + struct perf_cpu_context *cpuctx;
> > struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
> > struct event_function_struct efs = {
> > .event = event,
> > .func = func,
> > .data = data,
> > };
> > + unsigned long flags;
> >
> > if (!event->parent) {
> > /*
> > @@ -327,22 +328,27 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
> > if (!task_function_call(task, event_function, &efs))
> > return;
> >
> > + local_irq_save(flags);
>
> This can just be local_irq_disable() though, seeing how the fingered
> commit replaced raw_spin_lock_irq().
>
> I'll queue the below...
>
> ---
> Subject: perf: Really fix event_function_call() locking
> From: Namhyung Kim <namhyung@kernel.org>
> Date: Tue Aug 13 22:55:11 CEST 2024
>
> Commit 558abc7e3f89 ("perf: Fix event_function_call() locking") lost
> IRQ disabling by mistake.
>
> Fixes: 558abc7e3f89 ("perf: Fix event_function_call() locking")
> Reported-by: Pengfei Xu <pengfei.xu@intel.com>
> Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/events/core.c | 13 ++++++++-----
> 1 file changed, 8 insertions(+), 5 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -298,8 +298,8 @@ static int event_function(void *info)
> static void event_function_call(struct perf_event *event, event_f func, void *data)
> {
> struct perf_event_context *ctx = event->ctx;
> - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
> + struct perf_cpu_context *cpuctx;
> struct event_function_struct efs = {
> .event = event,
> .func = func,
> @@ -327,22 +327,25 @@ static void event_function_call(struct p
> if (!task_function_call(task, event_function, &efs))
> return;
>
> + local_irq_disable();
> + cpuctx = this_cpu_ptr(&perf_cpu_context);
> perf_ctx_lock(cpuctx, ctx);
> /*
> * Reload the task pointer, it might have been changed by
> * a concurrent perf_event_context_sched_out().
> */
> task = ctx->task;
> - if (task == TASK_TOMBSTONE) {
> - perf_ctx_unlock(cpuctx, ctx);
> - return;
> - }
> + if (task == TASK_TOMBSTONE)
> + goto unlock;
> if (ctx->is_active) {
> perf_ctx_unlock(cpuctx, ctx);
> + local_irq_enable();
> goto again;
> }
> func(event, NULL, ctx, data);
> +unlock:
> perf_ctx_unlock(cpuctx, ctx);
> + local_irq_enable();
> }
>
> /*
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2024-08-14 2:35 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-13 1:34 ` Pengfei Xu
2024-08-13 15:19 ` Naresh Kamboju
2024-08-13 18:28 ` Namhyung Kim
2024-08-13 21:02 ` Peter Zijlstra
2024-08-14 2:35 ` Pengfei Xu
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
2024-08-07 15:17 ` Liang, Kan
2024-08-07 19:09 ` Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
2024-08-08 10:32 ` [tip: perf/core] " tip-bot2 for Peter Zijlstra
2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
2024-08-07 18:54 ` Namhyung Kim
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox