All of lore.kernel.org
 help / color / mirror / Atom feed
From: Thomas Gleixner <tglx@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	Frederic Weisbecker <frederic@kernel.org>
Subject: [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions
Date: Tue, 24 Feb 2026 09:32:50 +0100	[thread overview]
Message-ID: <875x7mv8wd.ffs@tglx> (raw)

During a hackbench run on a fully loaded machine, CPUs briefly go idle
when they run out of tasks, which is expected. What's not expected are
pointless NOHZ transitions like this:

       hackbench-1915    [001] d..2.    84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
 1)       <idle>-0       [001] dn.2.    84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dn.2.    84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
 2)       <idle>-0       [001] dN.2.    84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
          <idle>-0       [001] dN.2.    84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
          <idle>-0       [001] d..2.    84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
       hackbench-2138    [001] d..2.    84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120

#1 switches to NOHZ mode targeting the next expiring timer and #2
switches back to tick mode a whopping 4us later.

This happens with both the TEO and MENU governors in a VM guest. That's not
only pointless, it's also a performance issue, as each rearm of the timer
implies a VM exit.

Keep track of the idle time with a moving average and only allow the tick
to be stopped in can_stop_idle_tick() when that average is larger than
TICK_NSEC. That cures this behaviour while still allowing the system to go
into long idle sleeps once the workload has stopped.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
 kernel/time/tick-sched.c |   20 +++++++++++++++++---
 kernel/time/tick-sched.h |    9 +++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
 	touch_softlockup_watchdog_sched();
 }
 
+static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
+{
+	ktime_t delta = now - ts->idle_dur_entry;
+	unsigned int idx = ts->idle_dur_idx;
+
+	ts->idle_dur_sum += delta - ts->idle_dur[idx];
+	ts->idle_dur[idx] = delta;
+	ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK;
+}
+
 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
 	ktime_t delta;
@@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
 
 	delta = ktime_sub(now, ts->idle_entrytime);
 
+	tick_nohz_update_idle_duration(ts, now);
+
 	write_seqcount_begin(&ts->idle_sleeptime_seq);
 	if (nr_iowait_cpu(smp_processor_id()) > 0)
 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
 			return false;
 	}
 
-	return true;
+	return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
 }
 
 /**
@@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
 
 	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
 	tick_nohz_start_idle(ts);
+	ts->idle_dur_entry = ts->idle_entrytime;
 
 	local_irq_enable();
 }
@@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
 	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
 	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
 
-	if (idle_active || tick_stopped)
-		now = ktime_get();
+	now = ktime_get();
 
 	if (idle_active)
 		tick_nohz_stop_idle(ts, now);
+	else
+		tick_nohz_update_idle_duration(ts, now);
 
 	if (tick_stopped)
 		tick_nohz_idle_update_tick(ts, now);
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -30,6 +30,9 @@ struct tick_device {
 /* High resolution tick mode */
 #define TS_FLAG_HIGHRES		BIT(5)
 
+#define IDLE_DUR_ENTRIES	8
+#define IDLE_DUR_MASK		(IDLE_DUR_ENTRIES - 1)
+
 /**
  * struct tick_sched - sched tick emulation and no idle tick control/stats
  *
@@ -95,6 +98,12 @@ struct tick_sched {
 	ktime_t				idle_sleeptime;
 	ktime_t				iowait_sleeptime;
 
+	/* Idle duration */
+	ktime_t				idle_dur[IDLE_DUR_ENTRIES];
+	ktime_t				idle_dur_entry;
+	ktime_t				idle_dur_sum;
+	unsigned int			idle_dur_idx;
+
 	/* Full dynticks handling */
 	atomic_t			tick_dep_mask;
 

             reply	other threads:[~2026-02-24  8:32 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-24  8:32 Thomas Gleixner [this message]
2026-02-24  9:35 ` [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions Christian Loehle
2026-02-24 16:13   ` Thomas Gleixner
2026-02-24 21:31     ` Rafael J. Wysocki
2026-02-24 21:55       ` Thomas Gleixner
2026-02-25 12:54         ` Rafael J. Wysocki
2026-02-25 13:10           ` Rafael J. Wysocki
2026-02-25 16:00             ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=875x7mv8wd.ffs@tglx \
    --to=tglx@kernel.org \
    --cc=frederic@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=rafael@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.