[PATCH 0/5] perf: Per PMU context reschedule and misc

public inbox for linux-perf-users@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/5] perf: Per PMU context reschedule and misc
@ 2024-08-07 11:29 Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
                   ` (6 more replies)
  0 siblings, 7 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

Hi,

This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
started with me trying to clean up and get rid of corner cases, and then got
more involved when Kan noted the timekeeping issue.

Anyway, please review / test.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/5] perf: Optimize context reschedule for single PMU cases
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

Currently re-scheduling a context will reschedule all active PMUs for
that context, even if it is known only a single event is added.

Namhyung reported that changing this to only reschedule the affected
PMU when possible provides significant performance gains under certain
conditions.

Therefore, allow partial context reschedules for a specific PMU, that
of the event modified.

While the patch looks somewhat noisy, it mostly just propagates a new
@pmu argument through the callchain and modifies the epc loop to only
pick the 'epc->pmu == @pmu' case.

Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |  164 +++++++++++++++++++++++++++------------------------
 1 file changed, 88 insertions(+), 76 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do {									\
 	___p;								\
 })
 
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup)				\
+	list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+		if (_cgroup && !_epc->nr_cgroups)			\
+			continue;					\
+		else if (_pmu && _epc->pmu != _pmu)			\
+			continue;					\
+		else
+
 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_disable(pmu_ctx->pmu);
-	}
 }
 
 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_enable(pmu_ctx->pmu);
-	}
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
 
 #ifdef CONFIG_CGROUP_PERF
 
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct ta
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 	/*
 	 * must not be done before ctxswout due
 	 * to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct ta
 	 * perf_cgroup_set_timestamp() in ctx_sched_in()
 	 * to not have to pass task around
 	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
 	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
-				enum event_type_t event_type)
+			       struct pmu *pmu,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 
@@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct pe
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, event_type);
+	ctx_sched_out(ctx, pmu, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx)
+				struct perf_event_context *ctx,
+				struct pmu *pmu)
 {
-	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
 	if (ctx)
-		 ctx_sched_in(ctx, EVENT_PINNED);
-	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+		 ctx_sched_in(ctx, pmu, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
 	if (ctx)
-		 ctx_sched_in(ctx, EVENT_FLEXIBLE);
+		 ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct p
  * event_type is a bit mask of the types of events involved. For CPU events,
  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
  */
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
 			struct perf_event_context *task_ctx,
-			enum event_type_t event_type)
+			struct pmu *pmu, enum event_type_t event_type)
 {
 	bool cpu_event = !!(event_type & EVENT_CPU);
+	struct perf_event_pmu_context *epc;
 
 	/*
 	 * If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx, false);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_disable(epc->pmu);
+
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx, false);
-		task_ctx_sched_out(task_ctx, event_type);
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_disable(epc->pmu);
+
+		task_ctx_sched_out(task_ctx, pmu, event_type);
 	}
 
 	/*
@@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_
 	 *  - otherwise, do nothing more.
 	 */
 	if (cpu_event)
-		ctx_sched_out(&cpuctx->ctx, event_type);
+		ctx_sched_out(&cpuctx->ctx, pmu, event_type);
 	else if (event_type & EVENT_PINNED)
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
 
-	perf_event_sched_in(cpuctx, task_ctx);
+	perf_event_sched_in(cpuctx, task_ctx, pmu);
 
-	perf_ctx_enable(&cpuctx->ctx, false);
-	if (task_ctx)
-		perf_ctx_enable(task_ctx, false);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_enable(epc->pmu);
+
+	if (task_ctx) {
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_enable(epc->pmu);
+	}
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
 	perf_ctx_lock(cpuctx, task_ctx);
-	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+	ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
 	perf_ctx_unlock(cpuctx, task_ctx);
 }
 
@@ -2802,9 +2810,10 @@ static int  __perf_install_in_context(vo
 #endif
 
 	if (reprogram) {
-		ctx_sched_out(ctx, EVENT_TIME);
+		ctx_sched_out(ctx, NULL, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
-		ctx_resched(cpuctx, task_ctx, get_event_type(event));
+		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+			    get_event_type(event));
 	} else {
 		add_event_to_ctx(event, ctx);
 	}
@@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct p
 		return;
 
 	if (ctx->is_active)
-		ctx_sched_out(ctx, EVENT_TIME);
+		ctx_sched_out(ctx, NULL, EVENT_TIME);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 	perf_cgroup_event_enable(event, ctx);
@@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct p
 		return;
 
 	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 		return;
 	}
 
@@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct p
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 		return;
 	}
 
@@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct p
 	if (ctx->task)
 		WARN_ON_ONCE(task_ctx != ctx);
 
-	ctx_resched(cpuctx, task_ctx, get_event_type(event));
+	ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
 }
 
 /*
@@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct p
 	perf_pmu_enable(pmu);
 }
 
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
 static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, pmu, cgroup)
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
-	}
 }
 
 /*
@@ -3579,7 +3594,7 @@ perf_event_context_sched_out(struct task
 
 inside_switch:
 		perf_ctx_sched_task_cb(ctx, false);
-		task_ctx_sched_out(ctx, EVENT_ALL);
+		task_ctx_sched_out(ctx, NULL, EVENT_ALL);
 
 		perf_ctx_enable(ctx, false);
 		raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct p
 			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
-				struct perf_event_groups *groups,
-				bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+			       enum event_type_t event_type)
 {
-	struct perf_event_pmu_context *pmu_ctx;
-
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
-	}
-}
+	struct perf_event_context *ctx = pmu_ctx->ctx;
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
-			       struct pmu *pmu)
-{
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	if (event_type & EVENT_PINNED)
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+	if (event_type & EVENT_FLEXIBLE)
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
 }
 
 static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
 	bool cgroup = event_type & EVENT_CGROUP;
 
@@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (is_active & EVENT_PINNED)
-		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+	if (is_active & EVENT_PINNED) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+	}
 
 	/* Then walk through the lower prio flexible groups */
-	if (is_active & EVENT_FLEXIBLE)
-		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+	if (is_active & EVENT_FLEXIBLE) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+	}
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
 		perf_ctx_disable(&cpuctx->ctx, false);
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
 	}
 
-	perf_event_sched_in(cpuctx, ctx);
+	perf_event_sched_in(cpuctx, ctx, NULL);
 
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
@@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct p
 		update_context_time(&cpuctx->ctx);
 		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
 		rotate_ctx(&cpuctx->ctx, cpu_event);
-		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+		__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
 	}
 
 	if (task_event)
 		rotate_ctx(task_epc->ctx, task_event);
 
 	if (task_event || (task_epc && cpu_event))
-		__pmu_ctx_sched_in(task_epc->ctx, pmu);
+		__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
 
 	perf_pmu_enable(pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(st
 
 	cpuctx = this_cpu_ptr(&perf_cpu_context);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, EVENT_TIME);
+	ctx_sched_out(ctx, NULL, EVENT_TIME);
 
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(st
 	 */
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
-		ctx_resched(cpuctx, ctx, event_type);
+		ctx_resched(cpuctx, ctx, NULL, event_type);
 	} else {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
@@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context
 	 * in.
 	 */
 	raw_spin_lock_irq(&child_ctx->lock);
-	task_ctx_sched_out(child_ctx, EVENT_ALL);
+	task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
 
 	/*
 	 * Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(vo
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
-	ctx_sched_out(ctx, EVENT_TIME);
+	ctx_sched_out(ctx, NULL, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
 	raw_spin_unlock(&ctx->lock);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/5] perf: Extract a few helpers
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

The context time update code is repeated verbatim a few times.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |   39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2330,6 +2330,24 @@ group_sched_out(struct perf_event *group
 		event_sched_out(event, ctx);
 }
 
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, false);
+	}
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+}
+
 #define DETACH_GROUP	0x01UL
 #define DETACH_CHILD	0x02UL
 #define DETACH_DEAD	0x04UL
@@ -2349,10 +2367,7 @@ __perf_remove_from_context(struct perf_e
 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
 	unsigned long flags = (unsigned long)info;
 
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, false);
-	}
+	ctx_time_update(cpuctx, ctx);
 
 	/*
 	 * Ensure event_sched_out() switches to OFF, at the very least
@@ -2437,12 +2452,8 @@ static void __perf_event_disable(struct
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
-
 	perf_pmu_disable(event->pmu_ctx->pmu);
+	ctx_time_update_event(ctx, event);
 
 	if (event == event->group_leader)
 		group_sched_out(event, ctx);
@@ -4529,10 +4540,7 @@ static void __perf_event_read(void *info
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
+	ctx_time_update_event(ctx, event);
 
 	perf_event_update_time(event);
 	if (data->group)
@@ -4732,10 +4740,7 @@ static int perf_event_read(struct perf_e
 		 * May read while context is not active (e.g., thread is
 		 * blocked), in that case we cannot update context time
 		 */
-		if (ctx->is_active & EVENT_TIME) {
-			update_context_time(ctx);
-			update_cgrp_time_from_event(event);
-		}
+		ctx_time_update_event(ctx, event);
 
 		perf_event_update_time(event);
 		if (group)



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 3/5] perf: Fix event_function_call() locking
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
  2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

All the event_function/@func call contexts already use perf_ctx_lock(),
except for the !ctx->is_active case. Make it all consistent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |    9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -263,6 +263,7 @@ static int event_function(void *info)
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 	struct event_function_struct efs = {
 		.event = event,
@@ -291,22 +292,22 @@ static void event_function_call(struct p
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
-	raw_spin_lock_irq(&ctx->lock);
+	perf_ctx_lock(cpuctx, ctx);
 	/*
 	 * Reload the task pointer, it might have been changed by
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
 	if (task == TASK_TOMBSTONE) {
-		raw_spin_unlock_irq(&ctx->lock);
+		perf_ctx_unlock(cpuctx, ctx);
 		return;
 	}
 	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
+		perf_ctx_unlock(cpuctx, ctx);
 		goto again;
 	}
 	func(event, NULL, ctx, data);
-	raw_spin_unlock_irq(&ctx->lock);
+	perf_ctx_unlock(cpuctx, ctx);
 }
 
 /*



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4/5] perf: Add context time freeze
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
                   ` (2 preceding siblings ...)
  2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
  2024-08-07 15:17   ` Liang, Kan
  2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

Many of the context reschedule users are of the form:

  ctx_sched_out(.type = EVENT_TIME);
  ... modify context
  ctx_resched();

With the idea that the whole reschedule happens with a single
time-stamp, rather than with each ctx_sched_out() advancing time and
ctx_sched_in() re-starting time, creating a non-atomic experience.

However, Kan noticed that since this completely stops time, it
actually loses a bit of time between the stop and start. Worse, now
that we can do partial (per PMU) reschedules, the PMUs that are not
scheduled out still observe the time glitch.

Replace this with:

  ctx_time_freeze();
  ... modify context
  ctx_resched();

With the assumption that this happens in a perf_ctx_lock() /
perf_ctx_unlock() pair.

The new ctx_time_freeze() updates time and sets EVENT_FROZEN, and
ensures EVENT_TIME and EVENT_FROZEN remain set. This prevents
perf_event_time_now() from observing a time wobble from not seeing
EVENT_TIME for a little while.

Additionally, this avoids losing time between
ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
timestamp.

Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |  128 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 42 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
 	return data.ret;
 }
 
+enum event_type_t {
+	EVENT_FLEXIBLE	= 0x01,
+	EVENT_PINNED	= 0x02,
+	EVENT_TIME	= 0x04,
+	EVENT_FROZEN	= 0x08,
+	/* see ctx_resched() for details */
+	EVENT_CPU	= 0x10,
+	EVENT_CGROUP	= 0x20,
+
+	/* compound helpers */
+	EVENT_ALL         = EVENT_FLEXIBLE | EVENT_PINNED,
+	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+	raw_spin_lock(&ctx->lock);
+	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 			  struct perf_event_context *ctx)
 {
-	raw_spin_lock(&cpuctx->ctx.lock);
+	__perf_ctx_lock(&cpuctx->ctx);
 	if (ctx)
-		raw_spin_lock(&ctx->lock);
+		__perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+	/*
+	 * If ctx_sched_in() didn't again set any ALL flags, clean up
+	 * after ctx_sched_out() by clearing is_active.
+	 */
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!(ctx->is_active & EVENT_ALL))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;
+	}
+	raw_spin_unlock(&ctx->lock);
 }
 
 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 			    struct perf_event_context *ctx)
 {
 	if (ctx)
-		raw_spin_unlock(&ctx->lock);
-	raw_spin_unlock(&cpuctx->ctx.lock);
+		__perf_ctx_unlock(ctx);
+	__perf_ctx_unlock(&cpuctx->ctx);
 }
 
 #define TASK_TOMBSTONE ((void *)-1L)
@@ -370,16 +405,6 @@ static void event_function_local(struct
 	(PERF_SAMPLE_BRANCH_KERNEL |\
 	 PERF_SAMPLE_BRANCH_HV)
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_TIME = 0x4,
-	/* see ctx_resched() for details */
-	EVENT_CPU = 0x8,
-	EVENT_CGROUP = 0x10,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 /*
  * perf_sched_events : >0 events exist
  */
@@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
 }
 
 static inline void
-ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
 {
 	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, false);
+		update_cgrp_time_from_cpuctx(cpuctx, final);
 	}
 }
 
 static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	__ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	ctx_time_update(cpuctx, ctx);
+	if (ctx->is_active & EVENT_TIME)
+		ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
 ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 {
 	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
 		update_context_time(ctx);
 		update_cgrp_time_from_event(event);
 	}
@@ -2822,7 +2868,7 @@ static int  __perf_install_in_context(vo
 #endif
 
 	if (reprogram) {
-		ctx_sched_out(ctx, NULL, EVENT_TIME);
+		ctx_time_freeze(cpuctx, ctx);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
 			    get_event_type(event));
@@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	if (ctx->is_active)
-		ctx_sched_out(ctx, NULL, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 	perf_cgroup_event_enable(event, ctx);
@@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
 	if (!ctx->is_active)
 		return;
 
-	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
+	if (!event_filter_match(event))
 		return;
-	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 		return;
-	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
 	struct perf_event *event, *tmp;
 	struct pmu *pmu = pmu_ctx->pmu;
 
-	if (ctx->task && !ctx->is_active) {
+	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
 		struct perf_cpu_pmu_context *cpc;
 
 		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
 	 *
 	 * would only update time for the pinned events.
 	 */
-	if (is_active & EVENT_TIME) {
-		/* update (and stop) ctx time */
-		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+	/*
+	 * CPU-release for the below ->is_active store,
+	 * see __load_acquire() in perf_event_time_now()
+	 */
+	barrier();
+	ctx->is_active &= ~event_type;
+
+	if (!(ctx->is_active & EVENT_ALL)) {
 		/*
-		 * CPU-release for the below ->is_active store,
-		 * see __load_acquire() in perf_event_time_now()
+		 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+		 * does not observe a hole. perf_ctx_unlock() will clean up.
 		 */
-		barrier();
+		if (ctx->is_active & EVENT_FROZEN)
+			ctx->is_active &= EVENT_TIME_FROZEN;
+		else
+			ctx->is_active = 0;
 	}
 
-	ctx->is_active &= ~event_type;
-	if (!(ctx->is_active & EVENT_ALL))
-		ctx->is_active = 0;
-
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-		if (!ctx->is_active)
+		if (!(ctx->is_active & EVENT_ALL))
 			cpuctx->task_ctx = NULL;
 	}
 
@@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
 
 	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
-		if (!is_active)
+		if (!(is_active & EVENT_ALL))
 			cpuctx->task_ctx = ctx;
 		else
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
 
 	cpuctx = this_cpu_ptr(&perf_cpu_context);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, NULL, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
@@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, NULL, event_type);
-	} else {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/5] perf: Add context time freeze
  2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
@ 2024-08-07 15:17   ` Liang, Kan
  2024-08-07 19:09     ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Liang, Kan @ 2024-08-07 15:17 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: acme, namhyung, mark.rutland, alexander.shishkin, jolsa, irogers,
	adrian.hunter, linux-perf-users, linux-kernel



On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Many of the context reschedule users are of the form:
> 
>   ctx_sched_out(.type = EVENT_TIME);
>   ... modify context
>   ctx_resched();
> 
> With the idea that the whole reschedule happens with a single
> time-stamp, rather than with each ctx_sched_out() advancing time and
> ctx_sched_in() re-starting time, creating a non-atomic experience.
> 
> However, Kan noticed that since this completely stops time, it
> actually loses a bit of time between the stop and start. Worse, now
> that we can do partial (per PMU) reschedules, the PMUs that are not
> scheduled out still observe the time glitch.
> 
> Replace this with:
> 
>   ctx_time_freeze();
>   ... modify context
>   ctx_resched();
> 
> With the assumption that this happens in a perf_ctx_lock() /
> perf_ctx_unlock() pair.
> 
> The new ctx_time_freeze() updates time, sets EVENT_FROZEN, and ensures
> EVENT_TIME and EVENT_FROZEN remain set; this prevents
> perf_event_time_now() from observing a time wobble from not seeing
> EVENT_TIME for a little while.
> 
> Additionally, this avoids losing time between
> ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
> timestamp.
> 
> Reported-by: Kan Liang <kan.liang@linux.intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/events/core.c |  128 ++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 86 insertions(+), 42 deletions(-)
> 
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
>  	return data.ret;
>  }
>  
> +enum event_type_t {
> +	EVENT_FLEXIBLE	= 0x01,
> +	EVENT_PINNED	= 0x02,
> +	EVENT_TIME	= 0x04,
> +	EVENT_FROZEN	= 0x08,
> +	/* see ctx_resched() for details */
> +	EVENT_CPU	= 0x10,
> +	EVENT_CGROUP	= 0x20,
> +
> +	/* compound helpers */
> +	EVENT_ALL         = EVENT_FLEXIBLE | EVENT_PINNED,
> +	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
> +};
> +
> +static inline void __perf_ctx_lock(struct perf_event_context *ctx)
> +{
> +	raw_spin_lock(&ctx->lock);
> +	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
> +}
> +
>  static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
>  			  struct perf_event_context *ctx)
>  {
> -	raw_spin_lock(&cpuctx->ctx.lock);
> +	__perf_ctx_lock(&cpuctx->ctx);
>  	if (ctx)
> -		raw_spin_lock(&ctx->lock);
> +		__perf_ctx_lock(ctx);
> +}
> +
> +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> +{
> +	/*
> +	 * If ctx_sched_in() didn't again set any ALL flags, clean up
> +	 * after ctx_sched_out() by clearing is_active.
> +	 */
> +	if (ctx->is_active & EVENT_FROZEN) {
> +		if (!(ctx->is_active & EVENT_ALL))

Nit:
It may be better to add a macro/inline function to replace all the
(ctx->is_active & EVENT_ALL) check? For example,

+static inline bool perf_ctx_has_active_events(struct perf_event_context
*ctx)
+{
+	return ctx->is_active & EVENT_ALL;
+}
...
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!perf_ctx_has_active_events(ctx))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;

It can tell very straightforwardly that we want to clear all flags if
there is no active event.
The EVENT_ALL may bring confusion. It actually means all events, not all
event types. The developer may have to go to the define and figure out
what exactly the EVENT_ALL includes.

Thanks,
Kan

> +			ctx->is_active = 0;
> +		else
> +			ctx->is_active &= ~EVENT_FROZEN;
> +	}
> +	raw_spin_unlock(&ctx->lock);
>  }
>  
>  static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
>  			    struct perf_event_context *ctx)
>  {
>  	if (ctx)
> -		raw_spin_unlock(&ctx->lock);
> -	raw_spin_unlock(&cpuctx->ctx.lock);
> +		__perf_ctx_unlock(ctx);
> +	__perf_ctx_unlock(&cpuctx->ctx);
>  }
>  
>  #define TASK_TOMBSTONE ((void *)-1L)
> @@ -370,16 +405,6 @@ static void event_function_local(struct
>  	(PERF_SAMPLE_BRANCH_KERNEL |\
>  	 PERF_SAMPLE_BRANCH_HV)
>  
> -enum event_type_t {
> -	EVENT_FLEXIBLE = 0x1,
> -	EVENT_PINNED = 0x2,
> -	EVENT_TIME = 0x4,
> -	/* see ctx_resched() for details */
> -	EVENT_CPU = 0x8,
> -	EVENT_CGROUP = 0x10,
> -	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> -};
> -
>  /*
>   * perf_sched_events : >0 events exist
>   */
> @@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
>  }
>  
>  static inline void
> -ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
>  {
>  	if (ctx->is_active & EVENT_TIME) {
> +		if (ctx->is_active & EVENT_FROZEN)
> +			return;
>  		update_context_time(ctx);
> -		update_cgrp_time_from_cpuctx(cpuctx, false);
> +		update_cgrp_time_from_cpuctx(cpuctx, final);
>  	}
>  }
>  
>  static inline void
> +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> +	__ctx_time_update(cpuctx, ctx, false);
> +}
> +
> +/*
> + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
> + */
> +static inline void
> +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> +	ctx_time_update(cpuctx, ctx);
> +	if (ctx->is_active & EVENT_TIME)
> +		ctx->is_active |= EVENT_FROZEN;
> +}
> +
> +static inline void
>  ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
>  {
>  	if (ctx->is_active & EVENT_TIME) {
> +		if (ctx->is_active & EVENT_FROZEN)
> +			return;
>  		update_context_time(ctx);
>  		update_cgrp_time_from_event(event);
>  	}
> @@ -2822,7 +2868,7 @@ static int  __perf_install_in_context(vo
>  #endif
>  
>  	if (reprogram) {
> -		ctx_sched_out(ctx, NULL, EVENT_TIME);
> +		ctx_time_freeze(cpuctx, ctx);
>  		add_event_to_ctx(event, ctx);
>  		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
>  			    get_event_type(event));
> @@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
>  	    event->state <= PERF_EVENT_STATE_ERROR)
>  		return;
>  
> -	if (ctx->is_active)
> -		ctx_sched_out(ctx, NULL, EVENT_TIME);
> +	ctx_time_freeze(cpuctx, ctx);
>  
>  	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
>  	perf_cgroup_event_enable(event, ctx);
> @@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
>  	if (!ctx->is_active)
>  		return;
>  
> -	if (!event_filter_match(event)) {
> -		ctx_sched_in(ctx, NULL, EVENT_TIME);
> +	if (!event_filter_match(event))
>  		return;
> -	}
>  
>  	/*
>  	 * If the event is in a group and isn't the group leader,
>  	 * then don't put it on unless the group is on.
>  	 */
> -	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> -		ctx_sched_in(ctx, NULL, EVENT_TIME);
> +	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
>  		return;
> -	}
>  
>  	task_ctx = cpuctx->task_ctx;
>  	if (ctx->task)
> @@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
>  	struct perf_event *event, *tmp;
>  	struct pmu *pmu = pmu_ctx->pmu;
>  
> -	if (ctx->task && !ctx->is_active) {
> +	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
>  		struct perf_cpu_pmu_context *cpc;
>  
>  		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> @@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
>  	 *
>  	 * would only update time for the pinned events.
>  	 */
> -	if (is_active & EVENT_TIME) {
> -		/* update (and stop) ctx time */
> -		update_context_time(ctx);
> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> +	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
> +
> +	/*
> +	 * CPU-release for the below ->is_active store,
> +	 * see __load_acquire() in perf_event_time_now()
> +	 */
> +	barrier();
> +	ctx->is_active &= ~event_type;
> +
> +	if (!(ctx->is_active & EVENT_ALL)) {
>  		/*
> -		 * CPU-release for the below ->is_active store,
> -		 * see __load_acquire() in perf_event_time_now()
> +		 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
> +		 * does not observe a hole. perf_ctx_unlock() will clean up.
>  		 */
> -		barrier();
> +		if (ctx->is_active & EVENT_FROZEN)
> +			ctx->is_active &= EVENT_TIME_FROZEN;
> +		else
> +			ctx->is_active = 0;
>  	}
>  
> -	ctx->is_active &= ~event_type;
> -	if (!(ctx->is_active & EVENT_ALL))
> -		ctx->is_active = 0;
> -
>  	if (ctx->task) {
>  		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> -		if (!ctx->is_active)
> +		if (!(ctx->is_active & EVENT_ALL))
>  			cpuctx->task_ctx = NULL;
>  	}
>  
> @@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
>  
>  	ctx->is_active |= (event_type | EVENT_TIME);
>  	if (ctx->task) {
> -		if (!is_active)
> +		if (!(is_active & EVENT_ALL))
>  			cpuctx->task_ctx = ctx;
>  		else
>  			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> @@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
>  
>  	cpuctx = this_cpu_ptr(&perf_cpu_context);
>  	perf_ctx_lock(cpuctx, ctx);
> -	ctx_sched_out(ctx, NULL, EVENT_TIME);
> +	ctx_time_freeze(cpuctx, ctx);
>  
>  	list_for_each_entry(event, &ctx->event_list, event_entry) {
>  		enabled |= event_enable_on_exec(event, ctx);
> @@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
>  	if (enabled) {
>  		clone_ctx = unclone_ctx(ctx);
>  		ctx_resched(cpuctx, ctx, NULL, event_type);
> -	} else {
> -		ctx_sched_in(ctx, NULL, EVENT_TIME);
>  	}
>  	perf_ctx_unlock(cpuctx, ctx);
>  
> 
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/5] perf: Add context time freeze
  2024-08-07 15:17   ` Liang, Kan
@ 2024-08-07 19:09     ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 19:09 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, linux-perf-users, linux-kernel

On Wed, Aug 07, 2024 at 11:17:18AM -0400, Liang, Kan wrote:

> > +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> > +{
> > +	/*
> > +	 * If ctx_sched_in() didn't again set any ALL flags, clean up
> > +	 * after ctx_sched_out() by clearing is_active.
> > +	 */
> > +	if (ctx->is_active & EVENT_FROZEN) {
> > +		if (!(ctx->is_active & EVENT_ALL))
> 
> Nit:
> It may be better to add a macro/inline function to replace all the
> (ctx->is_active & EVENT_ALL) check? For example,
> 
> +static inline bool perf_ctx_has_active_events(struct perf_event_context
> *ctx)
> +{
> +	return ctx->is_active & EVENT_ALL;
> +}
> ...
> +	if (ctx->is_active & EVENT_FROZEN) {
> +		if (!perf_ctx_has_active_events(ctx))
> +			ctx->is_active = 0;
> +		else
> +			ctx->is_active &= ~EVENT_FROZEN;
> 
> It can tell very straightforwardly that we want to clear all flags if
> there is no active event.
> The EVENT_ALL may bring confusion. It actually means all events, not all
> event types. The developer may have to go to the define and figure out
> what exactly the EVENT_ALL includes.

I'll push this on the todo list. I'm running short of time before I'm
taking a holiday and need to also spend time looking at the sched_ext
thing.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out()
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
                   ` (3 preceding siblings ...)
  2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
@ 2024-08-07 11:29 ` Peter Zijlstra
  2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
  2024-08-07 18:54 ` Namhyung Kim
  6 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2024-08-07 11:29 UTC (permalink / raw)
  To: mingo
  Cc: peterz, acme, namhyung, mark.rutland, alexander.shishkin, jolsa,
	irogers, adrian.hunter, kan.liang, linux-perf-users, linux-kernel

There is no point in doing the perf_pmu_disable() dance just to do
nothing. This happens for ctx_sched_out(.type = EVENT_TIME) for
instance.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3304,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
 		cpc->task_epc = NULL;
 	}
 
-	if (!event_type)
+	if (!(event_type & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(pmu);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/5] perf: Per PMU context reschedule and misc
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
                   ` (4 preceding siblings ...)
  2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
@ 2024-08-07 15:19 ` Liang, Kan
  2024-08-07 18:54 ` Namhyung Kim
  6 siblings, 0 replies; 10+ messages in thread
From: Liang, Kan @ 2024-08-07 15:19 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: acme, namhyung, mark.rutland, alexander.shishkin, jolsa, irogers,
	adrian.hunter, linux-perf-users, linux-kernel



On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Hi,
> 
> This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
> started with me trying to clean up and get rid of corner cases, and then got
> involved when Kan noted the time keeping issue.
> 
> Anyway, please review / test.
> 

Except for the tiny nit, the patch series looks good to me.

Reviewed-by: Kan Liang <kan.liang@linux.intel.com>

Thanks,
Kan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/5] perf: Per PMU context reschedule and misc
  2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
                   ` (5 preceding siblings ...)
  2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
@ 2024-08-07 18:54 ` Namhyung Kim
  6 siblings, 0 replies; 10+ messages in thread
From: Namhyung Kim @ 2024-08-07 18:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, irogers,
	adrian.hunter, kan.liang, linux-perf-users, linux-kernel

Hi Peter,

On Wed, Aug 7, 2024 at 4:56 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Hi,
>
> This is 'fallout' from Namhyung posting his per-pmu ctx_resched() patches. It
> started with me trying to clean up and get rid of corner cases, and then got
> involved when Kan noted the time keeping issue.
>
> Anyway, please review / test.

It works blazingly fast!

  # ./stress-pmu
  delta: 0.000307 sec (3 usec/op)

I found a problem with my patch that it called __pmu_ctx_sched_out() for
nothing (I guess is_active only has EVENT_TIME).  I thought ctx_sched_out()
would stop if it doesn't change EVENT_ALL but it iterated all PMUs anyway.

But with this change we don't need ctx_sched_out(EVENT_TIME) anymore.

Reviewed-by: Namhyung Kim <namhyung@kernel.org>

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-08-07 19:09 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-07 11:29 [PATCH 0/5] perf: Per PMU context reschedule and misc Peter Zijlstra
2024-08-07 11:29 ` [PATCH 1/5] perf: Optimize context reschedule for single PMU cases Peter Zijlstra
2024-08-07 11:29 ` [PATCH 2/5] perf: Extract a few helpers Peter Zijlstra
2024-08-07 11:29 ` [PATCH 3/5] perf: Fix event_function_call() locking Peter Zijlstra
2024-08-07 11:29 ` [PATCH 4/5] perf: Add context time freeze Peter Zijlstra
2024-08-07 15:17   ` Liang, Kan
2024-08-07 19:09     ` Peter Zijlstra
2024-08-07 11:29 ` [PATCH 5/5] perf: Optimize __pmu_ctx_sched_out() Peter Zijlstra
2024-08-07 15:19 ` [PATCH 0/5] perf: Per PMU context reschedule and misc Liang, Kan
2024-08-07 18:54 ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox