Re: scheduler performance regression since v6.11

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Chris Mason <clm@meta.com>,
	linux-kernel@vger.kernel.org, Ingo Molnar <mingo@kernel.org>,
	vschneid@redhat.com, Juri Lelli <juri.lelli@gmail.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	hannes@cmpxchg.org
Subject: Re: scheduler performance regression since v6.11
Date: Wed, 21 May 2025 16:54:47 +0200	[thread overview]
Message-ID: <20250521145447.GA31726@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20250520193831.GB39944@noisy.programming.kicks-ass.net>

On Tue, May 20, 2025 at 09:38:31PM +0200, Peter Zijlstra wrote:
> On Tue, May 20, 2025 at 04:38:09PM +0200, Dietmar Eggemann wrote:
> 
> > 3840cbe24cf0 - sched: psi: fix bogus pressure spikes from aggregation race
> > 
> > With CONFIG_PSI enabled we call cpu_clock(cpu) now multiple times (up to
> > 4 times per task switch in my setup) in:
> > 
> > __schedule() -> psi_sched_switch() -> psi_task_switch() ->
> > psi_group_change().
> > 
> > There seems to be another/other v6.12 related patch(es) later which
> > cause(s) another 4% regression I yet have to discover.
> 
> Urgh, let me add this to the pile to look at. Thanks!

Does something like the compile tested only hackery below work?

---
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1396674fa722..98c50bd4ce76 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -768,30 +768,20 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
+#define for_each_group(iter, group) \
+	for (typeof(group) iter = group; iter; iter = iter->parent)
+
 static void psi_group_change(struct psi_group *group, int cpu,
 			     unsigned int clear, unsigned int set,
-			     bool wake_clock)
+			     u64 now, bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
 	u32 state_mask;
-	u64 now;
 
 	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
-	/*
-	 * First we update the task counts according to the state
-	 * change requested through the @clear and @set bits.
-	 *
-	 * Then if the cgroup PSI stats accounting enabled, we
-	 * assess the aggregate resource states this CPU's tasks
-	 * have been in since the last change, and account any
-	 * SOME and FULL time these may have resulted in.
-	 */
-	write_seqcount_begin(&groupc->seq);
-	now = cpu_clock(cpu);
-
 	/*
 	 * Start with TSK_ONCPU, which doesn't have a corresponding
 	 * task count - it's just a boolean flag directly encoded in
@@ -843,7 +833,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 		groupc->state_mask = state_mask;
 
-		write_seqcount_end(&groupc->seq);
 		return;
 	}
 
@@ -864,8 +853,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 	groupc->state_mask = state_mask;
 
-	write_seqcount_end(&groupc->seq);
-
 	if (state_mask & group->rtpoll_states)
 		psi_schedule_rtpoll_work(group, 1, false);
 
@@ -901,6 +888,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
+	u64 now;
 
 	if (!task->pid)
 		return;
@@ -908,16 +896,31 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	psi_flags_change(task, clear, set);
 
 	group = task_psi_group(task);
-	do {
-		psi_group_change(group, cpu, clear, set, true);
-	} while ((group = group->parent));
+
+	for_each_group(tmp, group)
+		raw_write_seqcount_begin(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+
+	now = cpu_clock(cpu);
+
+	for_each_group(tmp, group) {
+		psi_group_change(tmp, cpu, clear, set, now, true);
+		raw_write_seqcount_end(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+	}
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep)
 {
-	struct psi_group *group, *common = NULL;
+	struct psi_group *prev_group = task_psi_group(prev);
+	struct psi_group *next_group = task_psi_group(next);
+	struct psi_group *common = NULL;
 	int cpu = task_cpu(prev);
+	u64 now;
+
+	if (prev->pid) {
+		for_each_group(tmp, prev_group)
+			raw_write_seqcount_begin(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+	}
 
 	if (next->pid) {
 		psi_flags_change(next, 0, TSK_ONCPU);
@@ -926,16 +929,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 * ancestors with @prev, those will already have @prev's
 		 * TSK_ONCPU bit set, and we can stop the iteration there.
 		 */
-		group = task_psi_group(next);
-		do {
-			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
-			    PSI_ONCPU) {
-				common = group;
+
+		for_each_group(tmp, next_group) {
+			struct psi_group_cpu *groupc = per_cpu_ptr(tmp->pcpu, cpu);
+
+			if (groupc->state_mask & PSI_ONCPU) {
+				common = tmp;
 				break;
 			}
+			raw_write_seqcount_begin(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+		}
+	}
+
+	now = cpu_clock(cpu);
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
-		} while ((group = group->parent));
+	if (next->pid) {
+		for_each_group(tmp, next_group) {
+			if (tmp == common)
+				break;
+			psi_group_change(tmp, cpu, 0, TSK_ONCPU, now, true);
+			raw_write_seqcount_end(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+		}
 	}
 
 	if (prev->pid) {
@@ -968,12 +982,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
 		psi_flags_change(prev, clear, set);
 
-		group = task_psi_group(prev);
-		do {
-			if (group == common)
+		for_each_group(tmp, prev_group) {
+			if (tmp == common)
 				break;
-			psi_group_change(group, cpu, clear, set, wake_clock);
-		} while ((group = group->parent));
+			psi_group_change(tmp, cpu, clear, set, now, wake_clock);
+			raw_write_seqcount_end(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+		}
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If there are
@@ -983,8 +997,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 */
 		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
 			clear &= ~TSK_ONCPU;
-			for (; group; group = group->parent)
-				psi_group_change(group, cpu, clear, set, wake_clock);
+			for_each_group(tmp, common) {
+				psi_group_change(tmp, cpu, clear, set, now, wake_clock);
+				raw_write_seqcount_end(&per_cpu_ptr(tmp->pcpu, cpu)->seq);
+			}
 		}
 	}
 }
@@ -1221,12 +1237,14 @@ void psi_cgroup_restart(struct psi_group *group)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-		struct rq_flags rf;
+		u64 now;
 
-		rq_lock_irq(rq, &rf);
-		psi_group_change(group, cpu, 0, 0, true);
-		rq_unlock_irq(rq, &rf);
+		guard(rq_lock_irq)(cpu_rq(cpu));
+
+		raw_write_seqcount_begin(&per_cpu_ptr(group->pcpu, cpu)->seq);
+		now = cpu_clock(cpu);
+		psi_group_change(group, cpu, 0, 0, now, true);
+		raw_write_seqcount_end(&per_cpu_ptr(group->pcpu, cpu)->seq);
 	}
 }
 #endif /* CONFIG_CGROUPS */

next prev parent reply	other threads:[~2025-05-21 14:54 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-07 23:13 scheduler performance regression since v6.11 Chris Mason
2025-05-09 19:49 ` Peter Zijlstra
2025-05-12 18:08   ` Peter Zijlstra
2025-05-12 19:39     ` Chris Mason
2025-05-12 22:35       ` Chris Mason
2025-05-13  7:15         ` Peter Zijlstra
2025-05-16 10:18         ` Peter Zijlstra
2025-05-20 14:38           ` Dietmar Eggemann
2025-05-20 14:53             ` Chris Mason
2025-05-21 13:59               ` Dietmar Eggemann
2025-05-21 14:32                 ` Chris Mason
2025-05-20 19:38             ` Peter Zijlstra
2025-05-21 14:02               ` Dietmar Eggemann
2025-05-21 15:02                 ` Peter Zijlstra
2025-05-21 19:00                   ` Peter Zijlstra
2025-05-21 14:54               ` Peter Zijlstra [this message]
2025-05-22  8:48                 ` Peter Zijlstra
2025-05-22 15:00                   ` Johannes Weiner
2025-05-23 15:40                     ` Peter Zijlstra
2025-05-23 12:27                   ` Dietmar Eggemann
2025-07-10 12:46                   ` [tip: sched/core] sched/psi: Optimize psi_group_change() cpu_clock() usage tip-bot2 for Peter Zijlstra

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:1396674fa72 dfblob:98c50bd4ce7 )
 OR (
bs:"Re: scheduler performance regression since v6.11" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250521145447.GA31726@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=clm@meta.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=juri.lelli@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.