public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: tglx@linutronix.de
Cc: arnd@arndb.de, anna-maria@linutronix.de, frederic@kernel.org,
	peterz@infradead.org, luto@kernel.org, mingo@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	dietmar.eggemann@arm.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	linux-kernel@vger.kernel.org, oliver.sang@intel.com
Subject: [RFC][PATCH 7/8] entry,hrtimer: Push reprogramming timers into the interrupt return path
Date: Thu, 18 Sep 2025 09:52:26 +0200	[thread overview]
Message-ID: <20250918080206.180399724@infradead.org> (raw)
In-Reply-To: 20250918075219.091828500@infradead.org

Currently hrtimer_interrupt() runs expired timers, which can re-arm
themselves, after which it computes the next expiration time and
reprograms the hardware.

However, things like HRTICK, a highres timer driving preemption,
cannot re-arm itself at the point of running, since the next task has
not been determined yet. The schedule() in the interrupt return path
will switch to the next task, which then causes a new hrtimer to be
programmed.

This then results in reprogramming the hardware at least twice, once
after running the timers, and once upon selecting the new task.

Notably, *both* events happen in the interrupt.

By pushing the hrtimer reprogram all the way into the interrupt return
path, it runs after schedule() and this double reprogram can be
avoided.

XXX: 0-day is unhappy with this patch -- it is reporting lockups that
very much look like a timer going missing. I am unable to reproduce them.
Notably, the lockups go away when the workloads are run without perf
monitors.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/thread_info_tif.h |    5 ++++-
 include/linux/hrtimer.h               |   17 +++++++++++++++++
 kernel/entry/common.c                 |    7 +++++++
 kernel/sched/core.c                   |    6 ++++++
 kernel/time/hrtimer.c                 |   28 ++++++++++++++++++++++++----
 5 files changed, 58 insertions(+), 5 deletions(-)

--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -41,8 +41,11 @@
 #define _TIF_PATCH_PENDING	BIT(TIF_PATCH_PENDING)
 
 #ifdef HAVE_TIF_RESTORE_SIGMASK
-# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal() */
+# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal()
 # define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
 #endif
 
+#define TIF_HRTIMER_REARM              11       // re-arm the timer
+#define _TIF_HRTIMER_REARM             BIT(TIF_HRTIMER_REARM)
+
 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -175,10 +175,27 @@ extern void hrtimer_interrupt(struct clo
 
 extern unsigned int hrtimer_resolution;
 
+#ifdef TIF_HRTIMER_REARM
+extern void _hrtimer_rearm(void);
+/*
+ * This is to be called on all irqentry_exit() paths; as well as in the context
+ * switch path before switch_to().
+ */
+static inline void hrtimer_rearm(void)
+{
+	if (test_thread_flag(TIF_HRTIMER_REARM))
+		_hrtimer_rearm();
+}
+#else
+static inline void hrtimer_rearm(void) { }
+#endif /* TIF_HRTIMER_REARM */
+
 #else
 
 #define hrtimer_resolution	(unsigned int)LOW_RES_NSEC
 
+static inline void hrtimer_rearm(void) { }
+
 #endif
 
 static inline ktime_t
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -7,6 +7,7 @@
 #include <linux/kmsan.h>
 #include <linux/livepatch.h>
 #include <linux/tick.h>
+#include <linux/hrtimer.h>
 
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -71,6 +72,7 @@ noinstr void irqentry_exit_to_user_mode(
 {
 	instrumentation_begin();
 	exit_to_user_mode_prepare(regs);
+	hrtimer_rearm();
 	instrumentation_end();
 	exit_to_user_mode();
 }
@@ -183,6 +185,7 @@ noinstr void irqentry_exit(struct pt_reg
 		 */
 		if (state.exit_rcu) {
 			instrumentation_begin();
+			hrtimer_rearm();
 			/* Tell the tracer that IRET will enable interrupts */
 			trace_hardirqs_on_prepare();
 			lockdep_hardirqs_on_prepare();
@@ -196,10 +199,14 @@ noinstr void irqentry_exit(struct pt_reg
 		if (IS_ENABLED(CONFIG_PREEMPTION))
 			irqentry_exit_cond_resched();
 
+		hrtimer_rearm();
 		/* Covers both tracing and lockdep */
 		trace_hardirqs_on();
 		instrumentation_end();
 	} else {
+		instrumentation_begin();
+		hrtimer_rearm();
+		instrumentation_end();
 		/*
 		 * IRQ flags state is correct already. Just tell RCU if it
 		 * was not watching on entry.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5161,6 +5161,12 @@ prepare_task_switch(struct rq *rq, struc
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
+	/*
+	 * Notably, this must be called after pick_next_task() but before
+	 * switch_to(), since the new task need not be on the return from
+	 * interrupt path.
+	 */
+	hrtimer_rearm();
 	prepare_arch_switch(next);
 }
 
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1892,10 +1892,9 @@ static __latent_entropy void hrtimer_run
  * Very similar to hrtimer_force_reprogram(), except it deals with
  * in_hrirq and hang_detected.
  */
-static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
+			    ktime_t now, ktime_t expires_next)
 {
-	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
-
 	cpu_base->expires_next = expires_next;
 	cpu_base->in_hrtirq = 0;
 
@@ -1970,9 +1969,30 @@ void hrtimer_interrupt(struct clock_even
 		cpu_base->hang_detected = 1;
 	}
 
-	__hrtimer_rearm(cpu_base, now);
+#ifdef TIF_HRTIMER_REARM
+	set_thread_flag(TIF_HRTIMER_REARM);
+#else
+	__hrtimer_rearm(cpu_base, now, expires_next);
+#endif
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 }
+
+#ifdef TIF_HRTIMER_REARM
+void _hrtimer_rearm(void)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t now, expires_next;
+
+	lockdep_assert_irqs_disabled();
+
+	scoped_guard (raw_spinlock, &cpu_base->lock) {
+		now = hrtimer_update_base(cpu_base);
+		expires_next = hrtimer_update_next_event(cpu_base);
+		__hrtimer_rearm(cpu_base, now, expires_next);
+		clear_thread_flag(TIF_HRTIMER_REARM);
+	}
+}
+#endif /* TIF_HRTIMER_REARM */
 #endif /* !CONFIG_HIGH_RES_TIMERS */
 
 /*



  parent reply	other threads:[~2025-09-18  8:06 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-18  7:52 [PATCH 0/8] hrtimer/sched: Improve hrtick Peter Zijlstra
2025-09-18  7:52 ` [PATCH 1/8] sched: Fix hrtick() vs scheduling context Peter Zijlstra
2025-09-19  3:53   ` K Prateek Nayak
2025-09-23  0:24   ` John Stultz
2025-12-03 18:25   ` [tip: sched/urgent] sched/hrtick: Fix hrtick() vs. " tip-bot2 for Peter Zijlstra
2025-12-03 18:31   ` tip-bot2 for Peter Zijlstra
2025-12-06  9:10   ` tip-bot2 for Peter Zijlstra
2025-09-18  7:52 ` [PATCH 2/8] sched/fair: Limit hrtick work Peter Zijlstra
2025-09-19 14:59   ` K Prateek Nayak
2025-11-28  8:25     ` Peter Zijlstra
2025-12-14  7:46   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-09-18  7:52 ` [PATCH 3/8] sched/eevdf: Fix HRTICK duration Peter Zijlstra
2025-09-19 15:34   ` K Prateek Nayak
2025-11-28  8:32     ` Peter Zijlstra
2025-09-18  7:52 ` [PATCH 4/8] hrtimer: Optimize __hrtimer_start_range_ns() Peter Zijlstra
2025-09-18  7:52 ` [PATCH 5/8] hrtimer,sched: Add fuzzy hrtimer mode for HRTICK Peter Zijlstra
2025-09-18  7:52 ` [PATCH 6/8] hrtimer: Re-arrange hrtimer_interrupt() Peter Zijlstra
2025-09-18  7:52 ` Peter Zijlstra [this message]
2025-09-20  9:29   ` [RFC][PATCH 7/8] entry,hrtimer: Push reprogramming timers into the interrupt return path Thomas Gleixner
2025-09-23  7:52     ` Peter Zijlstra
2025-09-23  8:18       ` Peter Zijlstra
2025-09-18  7:52 ` [RFC][PATCH 8/8] sched: Default enable HRTICK Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250918080206.180399724@infradead.org \
    --to=peterz@infradead.org \
    --cc=anna-maria@linutronix.de \
    --cc=arnd@arndb.de \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=frederic@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=oliver.sang@intel.com \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox