public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions
@ 2026-02-24  8:32 Thomas Gleixner
  2026-02-24  9:35 ` Christian Loehle
  0 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2026-02-24  8:32 UTC (permalink / raw)
  To: LKML; +Cc: Peter Zijlstra, Rafael J. Wysocki, Frederic Weisbecker

During a hackbench run on a fully loaded machine, CPUs briefly go idle
when they run out of tasks, which is expected. What's not expected are
pointless NOHZ transitions like this:

       hackbench-1915    [001] d..2.    84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
 1)       <idle>-0       [001] dn.2.    84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dn.2.    84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
 2)       <idle>-0       [001] dN.2.    84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dN.2.    84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
          <idle>-0       [001] d..2.    84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
       hackbench-2138    [001] d..2.    84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120

#1 switches to NOHZ mode targeting the next expiring timer and #2
switches back to tick mode a whopping 4us later.

This happens with both the TEO and MENU governors in a VM guest. That's
not only pointless, it's also a performance issue, as each rearm of the
timer implies a VM exit.

Keep track of the idle durations with a moving average over the last
IDLE_DUR_ENTRIES idle periods and only allow the tick to be stopped in
can_stop_idle_tick() when that average is larger than TICK_NSEC. That
cures this behaviour while still allowing the system to go into long
idle sleeps once the workload has stopped.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
 kernel/time/tick-sched.c |   20 +++++++++++++++++---
 kernel/time/tick-sched.h |    9 +++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
 	touch_softlockup_watchdog_sched();
 }
 
+static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
+{
+	ktime_t delta = now - ts->idle_dur_entry;
+	unsigned int idx = ts->idle_dur_idx;
+
+	ts->idle_dur_sum += delta - ts->idle_dur[idx];
+	ts->idle_dur[idx] = delta;
+	ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK;
+}
+
 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
 	ktime_t delta;
@@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
 
 	delta = ktime_sub(now, ts->idle_entrytime);
 
+	tick_nohz_update_idle_duration(ts, now);
+
 	write_seqcount_begin(&ts->idle_sleeptime_seq);
 	if (nr_iowait_cpu(smp_processor_id()) > 0)
 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
 			return false;
 	}
 
-	return true;
+	return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
 }
 
 /**
@@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
 
 	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
 	tick_nohz_start_idle(ts);
+	ts->idle_dur_entry = ts->idle_entrytime;
 
 	local_irq_enable();
 }
@@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
 	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
 	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
 
-	if (idle_active || tick_stopped)
-		now = ktime_get();
+	now = ktime_get();
 
 	if (idle_active)
 		tick_nohz_stop_idle(ts, now);
+	else
+		tick_nohz_update_idle_duration(ts, now);
 
 	if (tick_stopped)
 		tick_nohz_idle_update_tick(ts, now);
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -30,6 +30,9 @@ struct tick_device {
 /* High resolution tick mode */
 #define TS_FLAG_HIGHRES		BIT(5)
 
+#define IDLE_DUR_ENTRIES	8
+#define IDLE_DUR_MASK		(IDLE_DUR_ENTRIES - 1)
+
 /**
  * struct tick_sched - sched tick emulation and no idle tick control/stats
  *
@@ -95,6 +98,12 @@ struct tick_sched {
 	ktime_t				idle_sleeptime;
 	ktime_t				iowait_sleeptime;
 
+	/* Idle duration */
+	ktime_t				idle_dur[IDLE_DUR_ENTRIES];
+	ktime_t				idle_dur_entry;
+	ktime_t				idle_dur_sum;
+	unsigned int			idle_dur_idx;
+
 	/* Full dynticks handling */
 	atomic_t			tick_dep_mask;
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-02-25 16:00 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-02-24  8:32 [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions Thomas Gleixner
2026-02-24  9:35 ` Christian Loehle
2026-02-24 16:13   ` Thomas Gleixner
2026-02-24 21:31     ` Rafael J. Wysocki
2026-02-24 21:55       ` Thomas Gleixner
2026-02-25 12:54         ` Rafael J. Wysocki
2026-02-25 13:10           ` Rafael J. Wysocki
2026-02-25 16:00             ` Thomas Gleixner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox