public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Thomas Gleixner <tglx@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	Frederic Weisbecker <frederic@kernel.org>
Subject: [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions
Date: Tue, 24 Feb 2026 09:32:50 +0100	[thread overview]
Message-ID: <875x7mv8wd.ffs@tglx> (raw)

During a hackbench run on a fully loaded machine, CPUs go briefly idle
when they run out of tasks, which is expected. What's not expected are
pointless NOHZ transitions like this:

       hackbench-1915    [001] d..2.    84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
 1)       <idle>-0       [001] dn.2.    84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dn.2.    84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
 2)       <idle>-0       [001] dN.2.    84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dN.2.    84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
          <idle>-0       [001] d..2.    84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
       hackbench-2138    [001] d..2.    84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120

#1 switches to NOHZ mode targeting the next expiring timer and #2
switches back to tick mode a whopping 4us later.

This happens with both the TEO and MENU governors in a VM guest. That's
not only pointless, it's also a performance issue, as each rearm of the
timer implies a VM exit.

Keep track of the idle time with a moving average over the last
IDLE_DUR_ENTRIES idle periods and only allow the tick to be stopped in
can_stop_idle_tick() when that average is larger than TICK_NSEC. That
cures this behaviour while still allowing the system to go into long idle
sleeps once the workload has stopped.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
 kernel/time/tick-sched.c |   20 +++++++++++++++++---
 kernel/time/tick-sched.h |    9 +++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
 	touch_softlockup_watchdog_sched();
 }
 
+static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
+{
+	ktime_t delta = now - ts->idle_dur_entry;
+	unsigned int idx = ts->idle_dur_idx;
+
+	ts->idle_dur_sum += delta - ts->idle_dur[idx];
+	ts->idle_dur[idx] = delta;
+	ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK;
+}
+
 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
 	ktime_t delta;
@@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
 
 	delta = ktime_sub(now, ts->idle_entrytime);
 
+	tick_nohz_update_idle_duration(ts, now);
+
 	write_seqcount_begin(&ts->idle_sleeptime_seq);
 	if (nr_iowait_cpu(smp_processor_id()) > 0)
 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
 			return false;
 	}
 
-	return true;
+	return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
 }
 
 /**
@@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
 
 	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
 	tick_nohz_start_idle(ts);
+	ts->idle_dur_entry = ts->idle_entrytime;
 
 	local_irq_enable();
 }
@@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
 	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
 	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
 
-	if (idle_active || tick_stopped)
-		now = ktime_get();
+	now = ktime_get();
 
 	if (idle_active)
 		tick_nohz_stop_idle(ts, now);
+	else
+		tick_nohz_update_idle_duration(ts, now);
 
 	if (tick_stopped)
 		tick_nohz_idle_update_tick(ts, now);
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -30,6 +30,9 @@ struct tick_device {
 /* High resolution tick mode */
 #define TS_FLAG_HIGHRES		BIT(5)
 
+#define IDLE_DUR_ENTRIES	8
+#define IDLE_DUR_MASK		(IDLE_DUR_ENTRIES - 1)
+
 /**
  * struct tick_sched - sched tick emulation and no idle tick control/stats
  *
@@ -95,6 +98,12 @@ struct tick_sched {
 	ktime_t				idle_sleeptime;
 	ktime_t				iowait_sleeptime;
 
+	/* Idle duration */
+	ktime_t				idle_dur[IDLE_DUR_ENTRIES];
+	ktime_t				idle_dur_entry;
+	ktime_t				idle_dur_sum;
+	unsigned int			idle_dur_idx;
+
 	/* Full dynticks handling */
 	atomic_t			tick_dep_mask;
 

             reply	other threads:[~2026-02-24  8:32 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-24  8:32 Thomas Gleixner [this message]
2026-02-24  9:35 ` [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions Christian Loehle
2026-02-24 16:13   ` Thomas Gleixner
2026-02-24 21:31     ` Rafael J. Wysocki
2026-02-24 21:55       ` Thomas Gleixner
2026-02-25 12:54         ` Rafael J. Wysocki
2026-02-25 13:10           ` Rafael J. Wysocki
2026-02-25 16:00             ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=875x7mv8wd.ffs@tglx \
    --to=tglx@kernel.org \
    --cc=frederic@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=rafael@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox