From: Thomas Gleixner <tglx@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
"Rafael J. Wysocki" <rafael@kernel.org>,
Frederic Weisbecker <frederic@kernel.org>
Subject: [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions
Date: Tue, 24 Feb 2026 09:32:50 +0100 [thread overview]
Message-ID: <875x7mv8wd.ffs@tglx> (raw)
During a hackbench run on a fully loaded machine, CPUs go briefly idle
when they run out of tasks, which is expected. What's not expected are
pointless NOHZ transitions like this:
hackbench-1915 [001] d..2. 84.086755: sched_switch: prev_comm=hackbench prev_pid=1915 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
1) <idle>-0 [001] dn.2. 84.086757: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=305340000000 softexpires=305340000000 mode=ABS|PINNED|HARD was_armed=1
<idle>-0 [001] dn.2. 84.086757: hrtimer_rearm: next_event=83885523974 deferred=0
2) <idle>-0 [001] dN.2. 84.086761: hrtimer_start: hrtimer=00000000db1ede74 function=tick_nohz_handler expires=82950000000 softexpires=82950000000 mode=ABS|PINNED|HARD was_armed=1
<idle>-0 [001] dN.2. 84.086761: hrtimer_rearm: next_event=82950000000 deferred=0
<idle>-0 [001] d..2. 84.086767: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=hackbench next_pid=2138 next_prio=120
hackbench-2138 [001] d..2. 84.086779: sched_switch: prev_comm=hackbench prev_pid=2138 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
#1 switches to NOHZ mode targeting the next expiring timer and #2
switches back to tick mode a whopping 4us later.
This happens with both the TEO and MENU governors in a VM guest. That's
not only pointless, it's also a performance issue, as each rearm of the
timer implies a VM exit.
Keep track of the idle durations with a moving average and only allow
the tick to be stopped in can_stop_idle_tick() when that average is
larger than TICK_NSEC. That cures this behaviour while still allowing the
system to go into long idle sleeps once the workload has stopped.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
kernel/time/tick-sched.c | 20 +++++++++++++++++---
kernel/time/tick-sched.h | 9 +++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,6 +751,16 @@ static void tick_nohz_update_jiffies(kti
touch_softlockup_watchdog_sched();
}
+static void tick_nohz_update_idle_duration(struct tick_sched *ts, ktime_t now)
+{
+ ktime_t delta = now - ts->idle_dur_entry;
+ unsigned int idx = ts->idle_dur_idx;
+
+ ts->idle_dur_sum += delta - ts->idle_dur[idx];
+ ts->idle_dur[idx] = delta;
+ ts->idle_dur_idx = (idx + 1) & IDLE_DUR_MASK;
+}
+
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
@@ -760,6 +770,8 @@ static void tick_nohz_stop_idle(struct t
delta = ktime_sub(now, ts->idle_entrytime);
+ tick_nohz_update_idle_duration(ts, now);
+
write_seqcount_begin(&ts->idle_sleeptime_seq);
if (nr_iowait_cpu(smp_processor_id()) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
@@ -1224,7 +1236,7 @@ static bool can_stop_idle_tick(int cpu,
return false;
}
- return true;
+ return ts->idle_dur_sum > TICK_NSEC * IDLE_DUR_ENTRIES;
}
/**
@@ -1292,6 +1304,7 @@ void tick_nohz_idle_enter(void)
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
tick_nohz_start_idle(ts);
+ ts->idle_dur_entry = ts->idle_entrytime;
local_irq_enable();
}
@@ -1490,11 +1503,12 @@ void tick_nohz_idle_exit(void)
idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
- if (idle_active || tick_stopped)
- now = ktime_get();
+ now = ktime_get();
if (idle_active)
tick_nohz_stop_idle(ts, now);
+ else
+ tick_nohz_update_idle_duration(ts, now);
if (tick_stopped)
tick_nohz_idle_update_tick(ts, now);
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -30,6 +30,9 @@ struct tick_device {
/* High resolution tick mode */
#define TS_FLAG_HIGHRES BIT(5)
+#define IDLE_DUR_ENTRIES 8
+#define IDLE_DUR_MASK (IDLE_DUR_ENTRIES - 1)
+
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
*
@@ -95,6 +98,12 @@ struct tick_sched {
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
+ /* Idle duration */
+ ktime_t idle_dur[IDLE_DUR_ENTRIES];
+ ktime_t idle_dur_entry;
+ ktime_t idle_dur_sum;
+ unsigned int idle_dur_idx;
+
/* Full dynticks handling */
atomic_t tick_dep_mask;
next reply other threads:[~2026-02-24 8:32 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-24 8:32 Thomas Gleixner [this message]
2026-02-24 9:35 ` [PATCH RFC] tick/sched: Prevent pointless NOHZ transitions Christian Loehle
2026-02-24 16:13 ` Thomas Gleixner
2026-02-24 21:31 ` Rafael J. Wysocki
2026-02-24 21:55 ` Thomas Gleixner
2026-02-25 12:54 ` Rafael J. Wysocki
2026-02-25 13:10 ` Rafael J. Wysocki
2026-02-25 16:00 ` Thomas Gleixner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=875x7mv8wd.ffs@tglx \
--to=tglx@kernel.org \
--cc=frederic@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=peterz@infradead.org \
--cc=rafael@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox