public inbox for linux-kernel@vger.kernel.org
* [PATCH 0/5] sched: Lazy preemption muck
@ 2024-10-07  7:46 Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
                   ` (9 more replies)
  0 siblings, 10 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

Hi!

During LPC Thomas reminded me that the lazy preemption stuff was not there yet.

So here goes; robot says it builds, and I checked that both a regular and a
PREEMPT_RT build boot and can change the mode.

Please have a poke.
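
For anyone wanting to poke at the modes: a minimal sketch of flipping the
model at runtime, assuming a PREEMPT_DYNAMIC build, a mounted debugfs and
root; the "preempt=" boot parameter takes the same strings.

/* Sketch, not part of the series: select the preemption model at runtime. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/sched/preempt", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* One of: none, voluntary, full, lazy (subject to RT/arch limits). */
	fputs("lazy", f);
	fclose(f);
	return 0;
}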



* [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
@ 2024-10-07  7:46 ` Peter Zijlstra
  2024-10-09 12:18   ` Sebastian Andrzej Siewior
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two.
Either bit will cause a resched on return-to-user, but only
TIF_NEED_RESCHED will drive IRQ preemption.

No behavioural change intended.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/entry-common.h |    3 ++-
 include/linux/entry-kvm.h    |    5 +++--
 include/linux/sched.h        |    3 ++-
 include/linux/thread_info.h  |   21 +++++++++++++++++----
 kernel/entry/common.c        |    2 +-
 kernel/entry/kvm.c           |    4 ++--
 kernel/sched/core.c          |   34 +++++++++++++++++++++-------------
 7 files changed, 48 insertions(+), 24 deletions(-)

--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -64,7 +64,8 @@
 
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
-	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
+	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -17,8 +17,9 @@
 #endif
 
 #define XFER_TO_GUEST_MODE_WORK						\
-	(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL |	\
-	 _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
+	 _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME |			\
+	 ARCH_XFER_TO_GUEST_MODE_WORK)
 
 struct kvm_vcpu;
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(
 
 static inline void clear_tsk_need_resched(struct task_struct *tsk)
 {
-	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+	atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+			   (atomic_long_t *)&task_thread_info(tsk)->flags);
 }
 
 static inline int test_tsk_need_resched(struct task_struct *tsk)
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {
 
 #include <asm/thread_info.h>
 
+#ifndef TIF_NEED_RESCHED_LAZY
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+#error Inconsistent PREEMPT_LAZY
+#endif
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
@@ -179,22 +187,27 @@ static __always_inline unsigned long rea
 
 #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return arch_test_bit(TIF_NEED_RESCHED,
+	return arch_test_bit(bit,
 			     (unsigned long *)(&current_thread_info()->flags));
 }
 
 #else
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return test_bit(TIF_NEED_RESCHED,
+	return test_bit(bit,
 			(unsigned long *)(&current_thread_info()->flags));
 }
 
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
 
+static __always_inline bool tif_need_resched(void)
+{
+	return tif_test_bit(TIF_NEED_RESCHED);
+}
+
 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
 static inline int arch_within_stack_frames(const void * const stack,
 					   const void * const stackend,
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_us
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_UPROBE)
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struc
 			return -EINTR;
 		}
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struc
 			return ret;
 
 		ti_work = read_thread_flags();
-	} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+	} while (ti_work & XFER_TO_GUEST_MODE_WORK);
 	return 0;
 }
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -936,10 +936,9 @@ static inline void hrtick_rq_init(struct
  * this avoids any races wrt polling state changes and thereby avoids
  * spurious IPIs.
  */
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	struct thread_info *ti = task_thread_info(p);
-	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+	return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
 }
 
 /*
@@ -964,9 +963,9 @@ static bool set_nr_if_polling(struct tas
 }
 
 #else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	set_tsk_need_resched(p);
+	atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);
 	return true;
 }
 
@@ -1071,28 +1070,37 @@ void wake_up_q(struct wake_q_head *head)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
 {
 	struct task_struct *curr = rq->curr;
+	struct thread_info *cti = task_thread_info(curr);
 	int cpu;
 
 	lockdep_assert_rq_held(rq);
 
-	if (test_tsk_need_resched(curr))
+	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
 	cpu = cpu_of(rq);
 
 	if (cpu == smp_processor_id()) {
-		set_tsk_need_resched(curr);
-		set_preempt_need_resched();
+		set_ti_thread_flag(cti, tif);
+		if (tif == TIF_NEED_RESCHED)
+			set_preempt_need_resched();
 		return;
 	}
 
-	if (set_nr_and_not_polling(curr))
-		smp_send_reschedule(cpu);
-	else
+	if (set_nr_and_not_polling(cti, tif)) {
+		if (tif == TIF_NEED_RESCHED)
+			smp_send_reschedule(cpu);
+	} else {
 		trace_sched_wake_idle_without_ipi(cpu);
+	}
+}
+
+void resched_curr(struct rq *rq)
+{
+	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
 void resched_cpu(int cpu)
@@ -1187,7 +1195,7 @@ static void wake_up_idle_cpu(int cpu)
 	 * and testing of the above solutions didn't appear to report
 	 * much benefits.
 	 */
-	if (set_nr_and_not_polling(rq->idle))
+	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
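
To make the split concrete, here is a stand-alone user-space toy of the
scheme; the bit numbers and helper names mirror the patch, but none of this
is the real kernel code. Return-to-user acts on either bit, IRQ-exit
preemption only on the full bit, and the IPI decision keys off
_TIF_POLLING_NRFLAG.

/* User-space toy of the two-bit resched scheme; an illustration only. */
#include <stdatomic.h>
#include <stdio.h>

#define TIF_NEED_RESCHED	3
#define TIF_NEED_RESCHED_LAZY	4
#define TIF_POLLING_NRFLAG	21

#define _TIF_NEED_RESCHED	(1UL << TIF_NEED_RESCHED)
#define _TIF_NEED_RESCHED_LAZY	(1UL << TIF_NEED_RESCHED_LAZY)
#define _TIF_POLLING_NRFLAG	(1UL << TIF_POLLING_NRFLAG)

static _Atomic unsigned long ti_flags;

/* Like set_nr_and_not_polling(): set @tif, report whether an IPI is needed. */
static int set_nr_and_not_polling(int tif)
{
	return !(atomic_fetch_or(&ti_flags, 1UL << tif) & _TIF_POLLING_NRFLAG);
}

/* Return-to-user reschedules on either bit... */
static int exit_to_user_wants_resched(void)
{
	return !!(ti_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY));
}

/* ...but IRQ-exit preemption only acts on the full bit. */
static int irq_exit_wants_preempt(void)
{
	return !!(ti_flags & _TIF_NEED_RESCHED);
}

int main(void)
{
	set_nr_and_not_polling(TIF_NEED_RESCHED_LAZY);
	printf("lazy set: user exit=%d, irq preempt=%d\n",
	       exit_to_user_wants_resched(), irq_exit_wants_preempt());

	set_nr_and_not_polling(TIF_NEED_RESCHED);
	printf("full set: user exit=%d, irq preempt=%d\n",
	       exit_to_user_wants_resched(), irq_exit_wants_preempt());

	/* clear_tsk_need_resched() clears both bits in one go. */
	atomic_fetch_and(&ti_flags, ~(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY));
	return 0;
}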




* [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
@ 2024-10-07  7:46 ` Peter Zijlstra
  2024-10-08  5:43   ` Ankur Arora
                     ` (4 more replies)
  2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
                   ` (7 subsequent siblings)
  9 siblings, 5 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

Change fair to use resched_curr_lazy(), which, when the lazy
preemption model is selected, will set TIF_NEED_RESCHED_LAZY.

This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
As such, the average delay between setting LAZY and actually
rescheduling will be TICK_NSEC/2.

In short, Lazy preemption will delay preemption for fair class but
will function as Full preemption for all the other classes, most
notably the realtime (RR/FIFO/DEADLINE) classes.

The goal is to bridge the performance gap with Voluntary, such that we
might eventually remove that option entirely.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |    8 ++++-
 kernel/Kconfig.preempt  |   15 +++++++++
 kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/debug.c    |    5 +--
 kernel/sched/fair.c     |    6 +--
 kernel/sched/sched.h    |    1 
 6 files changed, 103 insertions(+), 8 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
 extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
 
 #else
 
@@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
 	return IS_ENABLED(CONFIG_PREEMPT);
 }
 
+static inline bool preempt_model_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+
 #endif
 
 static inline bool preempt_model_rt(void)
@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_rt();
+	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config ARCH_HAS_PREEMPT_LAZY
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
@@ -67,6 +70,18 @@ config PREEMPT
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_LAZY
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is less
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using Voluntary preemption.
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
 
 	lockdep_assert_rq_held(rq);
 
+	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+		tif = TIF_NEED_RESCHED;
+
 	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
@@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
 	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int tif_need_resched_lazy(void)
+{
+	if (dynamic_preempt_lazy())
+		return TIF_NEED_RESCHED_LAZY;
+
+	return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	__resched_curr(rq, tif_need_resched_lazy());
+}
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5598,6 +5627,10 @@ void sched_tick(void)
 	update_rq_clock(rq);
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		resched_curr(rq);
+
 	curr->sched_class->task_tick(rq, curr, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
@@ -7334,6 +7367,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7341,6 +7375,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7348,6 +7383,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule           <- preempt_schedule
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- false
+ *
+ * LAZY:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
  */
 
 enum {
@@ -7355,6 +7399,7 @@ enum {
 	preempt_dynamic_none,
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
+	preempt_dynamic_lazy,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7370,15 +7415,23 @@ int sched_dynamic_mode(const char *str)
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
 
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+	if (!strcmp(str, "lazy"))
+		return preempt_dynamic_lazy;
+#endif
+
 	return -EINVAL;
 }
 
+#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
 #else
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
@@ -7398,6 +7451,7 @@ static void __sched_dynamic_update(int m
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
+	preempt_dynamic_key_disable(preempt_lazy);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7407,6 +7461,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7418,6 +7473,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7429,9 +7485,22 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
+
+	case preempt_dynamic_lazy:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: lazy\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7494,6 +7563,8 @@ static void __init preempt_dynamic_init(
 			sched_dynamic_update(preempt_dynamic_none);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+			sched_dynamic_update(preempt_dynamic_lazy);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7514,6 +7585,7 @@ static void __init preempt_dynamic_init(
 PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struc
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full"
+		"none", "voluntary", "full", "lazy",
 	};
+	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+	for (i = 0; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
 }
@@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 	/*
@@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2692,6 +2692,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
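
The TICK_NSEC/2 figure in the changelog is just the expectation of a
uniformly distributed arrival inside a tick period; a throwaway simulation
(HZ=1000 assumed here) reproduces it.

/* Toy: average delay between setting LAZY and the tick promoting it. */
#include <stdio.h>
#include <stdlib.h>

#define TICK_NSEC 1000000.0	/* 1 ms tick, i.e. HZ=1000 */

int main(void)
{
	double total = 0.0;
	int samples = 1000000;

	srand(42);
	for (int i = 0; i < samples; i++) {
		/* LAZY gets set at a random point inside a tick period... */
		double set_at = (double)rand() / RAND_MAX * TICK_NSEC;
		/* ...and sched_tick() upgrades it at the next tick boundary. */
		total += TICK_NSEC - set_at;
	}
	printf("avg promotion delay: %.0f ns (TICK_NSEC/2 = %.0f ns)\n",
	       total / samples, TICK_NSEC / 2);
	return 0;
}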




* [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
@ 2024-10-07  7:46 ` Peter Zijlstra
  2024-10-08 13:24   ` Sebastian Andrzej Siewior
                     ` (2 more replies)
  2024-10-07  7:46 ` [PATCH 4/5] sched, x86: Enable Lazy preemption Peter Zijlstra
                   ` (6 subsequent siblings)
  9 siblings, 3 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
not a change in how preemption works, but rather it makes a ton more
code preemptible.

Notably, take away NONE and VOLATILE options for PREEMPT_RT, they make
no sense (but are technically possible).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/Kconfig.preempt |   12 +++++++-----
 kernel/sched/core.c    |    2 ++
 kernel/sched/debug.c   |    4 ++--
 3 files changed, 11 insertions(+), 7 deletions(-)

--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -20,6 +20,7 @@ choice
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
+	depends on !PREEMPT_RT
 	select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This is the traditional Linux preemption model, geared towards
@@ -35,6 +36,7 @@ config PREEMPT_NONE
 config PREEMPT_VOLUNTARY
 	bool "Voluntary Kernel Preemption (Desktop)"
 	depends on !ARCH_NO_PREEMPT
+	depends on !PREEMPT_RT
 	select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option reduces the latency of the kernel by adding more
@@ -54,7 +56,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	depends on !ARCH_NO_PREEMPT
-	select PREEMPT_BUILD
+	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -74,7 +76,7 @@ config PREEMPT_LAZY
 	bool "Scheduler controlled preemption model"
 	depends on !ARCH_NO_PREEMPT
 	depends on ARCH_HAS_PREEMPT_LAZY
-	select PREEMPT_BUILD
+	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option provides a scheduler driven preemption model that
 	  is fundamentally similar to full preemption, but is less
@@ -82,6 +84,8 @@ config PREEMPT_LAZY
 	  reduce lock holder preemption and recover some of the performance
 	  gains seen from using Voluntary preemption.
 
+endchoice
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
@@ -99,8 +103,6 @@ config PREEMPT_RT
 	  Select this if you are building a kernel for systems which
 	  require real-time guarantees.
 
-endchoice
-
 config PREEMPT_COUNT
        bool
 
@@ -110,7 +112,7 @@ config PREEMPTION
 
 config PREEMPT_DYNAMIC
 	bool "Preemption behaviour defined on boot"
-	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+	depends on HAVE_PREEMPT_DYNAMIC
 	select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
 	select PREEMPT_BUILD
 	default y if HAVE_PREEMPT_DYNAMIC_CALL
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7406,11 +7406,13 @@ int preempt_dynamic_mode = preempt_dynam
 
 int sched_dynamic_mode(const char *str)
 {
+#ifndef CONFIG_PREEMPT_RT
 	if (!strcmp(str, "none"))
 		return preempt_dynamic_none;
 
 	if (!strcmp(str, "voluntary"))
 		return preempt_dynamic_voluntary;
+#endif
 
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -248,9 +248,9 @@ static int sched_dynamic_show(struct seq
 		"none", "voluntary", "full", "lazy",
 	};
 	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
-	int i;
+	int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
 
-	for (i = 0; i < j; i++) {
+	for (; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
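
The index arithmetic in sched_dynamic_show() is a bit dense; a stand-alone
demo of the same windowing, with the config knobs turned into plain
parameters (nothing kernel about it):

/* Demo of the index window: skip "none"/"voluntary" on PREEMPT_RT,
 * drop "lazy" when the arch lacks ARCH_HAS_PREEMPT_LAZY.
 */
#include <stdio.h>

static void show(int is_rt, int arch_has_lazy, int cur)
{
	static const char *preempt_modes[] = {
		"none", "voluntary", "full", "lazy",
	};
	int j = sizeof(preempt_modes)/sizeof(preempt_modes[0]) - !arch_has_lazy;
	int i = is_rt * 2;	/* RT can't do none/voluntary */

	for (; i < j; i++)
		printf(i == cur ? "(%s) " : "%s ", preempt_modes[i]);
	printf("\n");
}

int main(void)
{
	show(0, 1, 3);	/* !RT, arch has lazy, current mode "lazy" */
	show(1, 1, 2);	/* RT: only "full" and "lazy" offered      */
	show(0, 0, 0);	/* no arch support: the classic three      */
	return 0;
}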




* [PATCH 4/5] sched, x86: Enable Lazy preemption
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (2 preceding siblings ...)
  2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
@ 2024-10-07  7:46 ` Peter Zijlstra
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

Add the TIF bit and select the Kconfig symbol to make it go.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/thread_info.h |    6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_HW_PTE_YOUNG
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,8 +87,9 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	1	/* callback before returning to user */
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
-#define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
-#define TIF_SSBD		5	/* Speculative store bypass disable */
+#define TIF_NEED_RESCHED_LAZY	4	/* lazy rescheduling necessary */
+#define TIF_SINGLESTEP		5	/* reenable singlestep on user return*/
+#define TIF_SSBD		6	/* Speculative store bypass disable */
 #define TIF_SPEC_IB		9	/* Indirect branch speculation mitigation */
 #define TIF_SPEC_L1D_FLUSH	10	/* Flush L1D on mm switches (processes) */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
@@ -110,6 +111,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SSBD		(1 << TIF_SSBD)
 #define _TIF_SPEC_IB		(1 << TIF_SPEC_IB)
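
The renumbering works because TIF_SINGLESTEP and TIF_SSBD simply shift up
into the gap below TIF_SPEC_IB; a trivial stand-alone check with the bit
values from the hunk above:

/* Sanity check: the renumbered TIF bits still produce disjoint masks. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int bits[] = { 1, 2, 3, 4, 5, 6, 9, 10, 11 };	/* values from the hunk */
	unsigned long mask = 0;

	for (unsigned int i = 0; i < sizeof(bits) / sizeof(bits[0]); i++) {
		assert(!(mask & (1UL << bits[i])));	/* no collisions */
		mask |= 1UL << bits[i];
	}
	printf("combined mask: %#lx\n", mask);
	return 0;
}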




* [PATCH 5/5] sched: Add laziest preempt model
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (3 preceding siblings ...)
  2024-10-07  7:46 ` [PATCH 4/5] sched, x86: Enable Lazy preemption Peter Zijlstra
@ 2024-10-07  7:46 ` Peter Zijlstra
  2024-10-08  5:59   ` Ankur Arora
                     ` (2 more replies)
  2024-10-07  8:33 ` [PATCH 0/5] sched: Lazy preemption muck Sebastian Andrzej Siewior
                   ` (4 subsequent siblings)
  9 siblings, 3 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-07  7:46 UTC (permalink / raw)
  To: bigeasy, tglx, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

Much like LAZY, except lazier still. It will not promote LAZY to full
preempt on tick, and will compete with None for suckage.

(do we really want this?)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   10 ++++++++-
 kernel/Kconfig.preempt  |   12 +++++++++++
 kernel/sched/core.c     |   49 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c    |    4 +--
 4 files changed, 71 insertions(+), 4 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -487,6 +487,7 @@ extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
 extern bool preempt_model_lazy(void);
+extern bool preempt_model_laziest(void);
 
 #else
 
@@ -507,6 +508,10 @@ static inline bool preempt_model_lazy(vo
 {
 	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
 }
+static inline bool preempt_model_laziest(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZIEST);
+}
 
 #endif
 
@@ -525,7 +530,10 @@ static inline bool preempt_model_rt(void
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
+	return preempt_model_full() ||
+	       preempt_model_lazy() ||
+	       preempt_model_laziest() ||
+	       preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -84,6 +84,18 @@ config PREEMPT_LAZY
 	  reduce lock holder preemption and recover some of the performance
 	  gains seen from using Voluntary preemption.
 
+config PREEMPT_LAZIEST
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is least
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using no preemption.
+
 endchoice
 
 config PREEMPT_RT
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1108,13 +1108,22 @@ void resched_curr(struct rq *rq)
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_promote);
 static __always_inline bool dynamic_preempt_lazy(void)
 {
 	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
 }
+static __always_inline bool dynamic_preempt_promote(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_promote);
+}
 #else
 static __always_inline bool dynamic_preempt_lazy(void)
 {
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY) | IS_ENABLED(CONFIG_PREEMPT_LAZIEST);
+}
+static __always_inline bool dynamic_preempt_promote(void)
+{
 	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
 }
 #endif
@@ -5628,7 +5637,7 @@ void sched_tick(void)
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
 
-	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+	if (dynamic_preempt_promote() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
 		resched_curr(rq);
 
 	curr->sched_class->task_tick(rq, curr, 0);
@@ -7368,6 +7377,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
  *   dynamic_preempt_lazy       <- false
+ *   dynamic_preempt_promote    <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7376,6 +7386,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
  *   dynamic_preempt_lazy       <- false
+ *   dynamic_preempt_promote    <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7384,6 +7395,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
  *   dynamic_preempt_lazy       <- false
+ *   dynamic_preempt_promote    <- false
  *
  * LAZY:
  *   cond_resched               <- RET0
@@ -7392,6 +7404,16 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
  *   dynamic_preempt_lazy       <- true
+ *   dynamic_preempt_promote    <- true
+ *
+ * LAZIEST:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
+ *   dynamic_preempt_promote    <- false
  */
 
 enum {
@@ -7400,6 +7422,7 @@ enum {
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
 	preempt_dynamic_lazy,
+	preempt_dynamic_laziest,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7420,6 +7443,9 @@ int sched_dynamic_mode(const char *str)
 #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
 	if (!strcmp(str, "lazy"))
 		return preempt_dynamic_lazy;
+
+	if (!strcmp(str, "laziest"))
+		return preempt_dynamic_laziest;
 #endif
 
 	return -EINVAL;
@@ -7454,6 +7480,7 @@ static void __sched_dynamic_update(int m
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
 	preempt_dynamic_key_disable(preempt_lazy);
+	preempt_dynamic_key_disable(preempt_promote);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7464,6 +7491,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
 		preempt_dynamic_key_disable(preempt_lazy);
+		preempt_dynamic_key_disable(preempt_promote);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7476,6 +7504,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
 		preempt_dynamic_key_disable(preempt_lazy);
+		preempt_dynamic_key_disable(preempt_promote);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7488,6 +7517,7 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
 		preempt_dynamic_key_disable(preempt_lazy);
+		preempt_dynamic_key_disable(preempt_promote);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
@@ -7500,9 +7530,23 @@ static void __sched_dynamic_update(int m
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
 		preempt_dynamic_key_enable(preempt_lazy);
+		preempt_dynamic_key_enable(preempt_promote);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: lazy\n");
 		break;
+
+	case preempt_dynamic_laziest:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		preempt_dynamic_key_disable(preempt_promote);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: laziest\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7567,6 +7611,8 @@ static void __init preempt_dynamic_init(
 			sched_dynamic_update(preempt_dynamic_voluntary);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
 			sched_dynamic_update(preempt_dynamic_lazy);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZIEST)) {
+			sched_dynamic_update(preempt_dynamic_laziest);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7588,6 +7634,7 @@ PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
 PREEMPT_MODEL_ACCESSOR(lazy);
+PREEMPT_MODEL_ACCESSOR(laziest);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,9 +245,9 @@ static ssize_t sched_dynamic_write(struc
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full", "lazy",
+		"none", "voluntary", "full", "lazy", "laziest",
 	};
-	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
+	int j = ARRAY_SIZE(preempt_modes) - 2*!IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
 
 	for (; i < j; i++) {
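
To spell out how this differs from plain LAZY, a toy of the sched_tick()
gate; dynamic_preempt_promote() is true for "lazy" and false for "laziest",
the rest of the names here are made up for the sketch.

/* Toy model of the tick promotion gate from patches 2 and 5. */
#include <stdio.h>

static int need_resched_lazy = 1;	/* fair asked for a lazy resched */
static int need_resched;

static void sched_tick(int promote)
{
	/* "lazy" upgrades the bit here; "laziest" never does. */
	if (promote && need_resched_lazy)
		need_resched = 1;
}

int main(void)
{
	sched_tick(1);
	printf("lazy:    NEED_RESCHED after tick = %d\n", need_resched);

	need_resched = 0;
	sched_tick(0);
	printf("laziest: NEED_RESCHED after tick = %d (waits for the next\n"
	       "         return to user or an explicit resched)\n", need_resched);
	return 0;
}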




* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (4 preceding siblings ...)
  2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
@ 2024-10-07  8:33 ` Sebastian Andrzej Siewior
  2024-10-08  4:58 ` Mike Galbraith
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-07  8:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
> Hi!
Hi,

> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
oh thank you.
I still have an older version in my tree. I will take a look and get back
to you.

Sebastian


* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (5 preceding siblings ...)
  2024-10-07  8:33 ` [PATCH 0/5] sched: Lazy preemption muck Sebastian Andrzej Siewior
@ 2024-10-08  4:58 ` Mike Galbraith
  2024-10-08 15:32 ` Sebastian Andrzej Siewior
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 57+ messages in thread
From: Mike Galbraith @ 2024-10-08  4:58 UTC (permalink / raw)
  To: Peter Zijlstra, bigeasy, tglx, mingo
  Cc: linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, vschneid, ankur.a.arora

On Mon, 2024-10-07 at 09:46 +0200, Peter Zijlstra wrote:
> Hi!
>
> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
>
> So here goes; robot says it builds, and I checked that both a regular and a
> PREEMPT_RT build boot and can change the mode.
>
> Please have a poke.

My box seems content.

I'm gonna miss VOLATILE (voluntary;) when it's eventually whacked, but
not a lot.. or at all in the general case; security and whatnot over time
have beaten up high-end switchers far worse.

tbench 8 30s - single run, box=i7-4790

master static
  voluntary     3620.45 MB/sec   1.000   mean3 3613.45  stddev3 14.08
  voluntary     2706.54 MB/sec    .747   +cpu_mitigations
  voluntary     4028.72 MB/sec   1.112   nostalgia (4.4.231)
  preempt       3449.35 MB/sec    .952
  none          3631.99 MB/sec   1.003

master dynamic
  none          3548.19 MB/sec    .980
  voluntary     3495.77 MB/sec    .965
  full          3476.50 MB/sec    .960
  lazy          3473.95 MB/sec    .959
  laziest       3492.09 MB/sec    .964

master dynamic rt
  full          2352.58 MB/sec    .649
  lazy          2986.28 MB/sec    .824
  laziest       2977.63 MB/sec    .822

distro kernel dynamic
  none          1744.51 MB/sec    .481
  none          2189.74 MB/sec    .604  mitigations=off



* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
@ 2024-10-08  5:43   ` Ankur Arora
  2024-10-08 14:48     ` Peter Zijlstra
  2024-10-09  8:50   ` Sebastian Andrzej Siewior
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 57+ messages in thread
From: Ankur Arora @ 2024-10-08  5:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: bigeasy, tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault


Peter Zijlstra <peterz@infradead.org> writes:

> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
>
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.
>
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
>
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
>
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/preempt.h |    8 ++++-
>  kernel/Kconfig.preempt  |   15 +++++++++
>  kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/sched/debug.c    |    5 +--
>  kernel/sched/fair.c     |    6 +--
>  kernel/sched/sched.h    |    1
>  6 files changed, 103 insertions(+), 8 deletions(-)
>
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>  extern bool preempt_model_none(void);
>  extern bool preempt_model_voluntary(void);
>  extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>
>  #else
>
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>  	return IS_ENABLED(CONFIG_PREEMPT);
>  }
>
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>  #endif
>
>  static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>   */
>  static inline bool preempt_model_preemptible(void)
>  {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>  }

In addition to preempt_model_preemptible() we probably also need

  static inline bool preempt_model_minimize_latency(void)
  {
  	return preempt_model_full() || preempt_model_rt();
  }

for spin_needbreak()/rwlock_needbreak().

That would make the behaviour of spin_needbreak() under the lazy model
similar to none/voluntary.
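
Roughly like this, as a user-space sketch; preempt_model_minimize_latency()
is the helper proposed above, everything else is a stand-in, not the real
kernel definitions.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real model accessors; pretend we run "lazy". */
static bool preempt_model_full(void)	{ return false; }
static bool preempt_model_rt(void)	{ return false; }

/* The helper suggested above. */
static bool preempt_model_minimize_latency(void)
{
	return preempt_model_full() || preempt_model_rt();
}

/* Only break out of a contended lock when latency really matters. */
static bool spin_needbreak(bool contended)
{
	return contended && preempt_model_minimize_latency();
}

int main(void)
{
	printf("lazy model, contended -> needbreak=%d\n", spin_needbreak(true));
	return 0;
}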

>  #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>  	select PREEMPTION
>  	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>  choice
>  	prompt "Preemption Model"
>  	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>  	  embedded system with latency requirements in the milliseconds
>  	  range.
>
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>  config PREEMPT_RT
>  	bool "Fully Preemptible Kernel (Real-Time)"
>  	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>
>  	lockdep_assert_rq_held(rq);
>
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +

Tasks with the idle policy get handled at the usual user space boundary.
Maybe add a comment reflecting that?

>  	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>  		return;
>
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>  	__resched_curr(rq, TIF_NEED_RESCHED);
>  }
>
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}

Nice. This simplifies things.

> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
>  void resched_cpu(int cpu)
>  {
>  	struct rq *rq = cpu_rq(cpu);
> @@ -5598,6 +5627,10 @@ void sched_tick(void)
>  	update_rq_clock(rq);
>  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +		resched_curr(rq);
> +

So this works for SCHED_NORMAL. But does this do the right thing for
deadline and the other scheduling classes?


--
ankur


* Re: [PATCH 5/5] sched: Add laziest preempt model
  2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
@ 2024-10-08  5:59   ` Ankur Arora
  2024-10-08 14:23   ` Thomas Gleixner
  2024-10-08 15:07   ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 57+ messages in thread
From: Ankur Arora @ 2024-10-08  5:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: bigeasy, tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault


Peter Zijlstra <peterz@infradead.org> writes:

> Much like LAZY, except lazier still. It will not promote LAZY to full
> preempt on tick, and will compete with None for suckage.

Yeah, none at least has cond_resched().

Can't think of any workload that would benefit from this though. Maybe
if you have a big cache footprint?

> (do we really want this?)
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/preempt.h |   10 ++++++++-
>  kernel/Kconfig.preempt  |   12 +++++++++++
>  kernel/sched/core.c     |   49 +++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/debug.c    |    4 +--
>  4 files changed, 71 insertions(+), 4 deletions(-)
>
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -487,6 +487,7 @@ extern bool preempt_model_none(void);
>  extern bool preempt_model_voluntary(void);
>  extern bool preempt_model_full(void);
>  extern bool preempt_model_lazy(void);
> +extern bool preempt_model_laziest(void);
>
>  #else
>
> @@ -507,6 +508,10 @@ static inline bool preempt_model_lazy(vo
>  {
>  	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
>  }
> +static inline bool preempt_model_laziest(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZIEST);
> +}
>
>  #endif
>
> @@ -525,7 +530,10 @@ static inline bool preempt_model_rt(void
>   */
>  static inline bool preempt_model_preemptible(void)
>  {
> -	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
> +	return preempt_model_full() ||
> +	       preempt_model_lazy() ||
> +	       preempt_model_laziest() ||
> +	       preempt_model_rt();
>  }
>
>  #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -84,6 +84,18 @@ config PREEMPT_LAZY
>  	  reduce lock holder preemption and recover some of the performance
>  	  gains seen from using Voluntary preemption.
>
> +config PREEMPT_LAZIEST
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is least
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using no preemption.
> +
>  endchoice
>
>  config PREEMPT_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1108,13 +1108,22 @@ void resched_curr(struct rq *rq)
>
>  #ifdef CONFIG_PREEMPT_DYNAMIC
>  static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_promote);
>  static __always_inline bool dynamic_preempt_lazy(void)
>  {
>  	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
>  }
> +static __always_inline bool dynamic_preempt_promote(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_promote);
> +}
>  #else
>  static __always_inline bool dynamic_preempt_lazy(void)
>  {
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY) | IS_ENABLED(CONFIG_PREEMPT_LAZIEST);
> +}
> +static __always_inline bool dynamic_preempt_promote(void)
> +{
>  	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
>  }
>  #endif
> @@ -5628,7 +5637,7 @@ void sched_tick(void)
>  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
>
> -	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +	if (dynamic_preempt_promote() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
>  		resched_curr(rq);
>
>  	curr->sched_class->task_tick(rq, curr, 0);
> @@ -7368,6 +7377,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>   *   preempt_schedule_notrace   <- NOP
>   *   irqentry_exit_cond_resched <- NOP
>   *   dynamic_preempt_lazy       <- false
> + *   dynamic_preempt_promote    <- false
>   *
>   * VOLUNTARY:
>   *   cond_resched               <- __cond_resched
> @@ -7376,6 +7386,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>   *   preempt_schedule_notrace   <- NOP
>   *   irqentry_exit_cond_resched <- NOP
>   *   dynamic_preempt_lazy       <- false
> + *   dynamic_preempt_promote    <- false
>   *
>   * FULL:
>   *   cond_resched               <- RET0
> @@ -7384,6 +7395,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>   *   preempt_schedule_notrace   <- preempt_schedule_notrace
>   *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
>   *   dynamic_preempt_lazy       <- false
> + *   dynamic_preempt_promote    <- false
>   *
>   * LAZY:
>   *   cond_resched               <- RET0
> @@ -7392,6 +7404,16 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>   *   preempt_schedule_notrace   <- preempt_schedule_notrace
>   *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
>   *   dynamic_preempt_lazy       <- true
> + *   dynamic_preempt_promote    <- true
> + *
> + * LAZIEST:
> + *   cond_resched               <- RET0
> + *   might_resched              <- RET0
> + *   preempt_schedule           <- preempt_schedule
> + *   preempt_schedule_notrace   <- preempt_schedule_notrace
> + *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- true
> + *   dynamic_preempt_promote    <- false
>   */
>
>  enum {
> @@ -7400,6 +7422,7 @@ enum {
>  	preempt_dynamic_voluntary,
>  	preempt_dynamic_full,
>  	preempt_dynamic_lazy,
> +	preempt_dynamic_laziest,
>  };
>
>  int preempt_dynamic_mode = preempt_dynamic_undefined;
> @@ -7420,6 +7443,9 @@ int sched_dynamic_mode(const char *str)
>  #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
>  	if (!strcmp(str, "lazy"))
>  		return preempt_dynamic_lazy;
> +
> +	if (!strcmp(str, "laziest"))
> +		return preempt_dynamic_laziest;
>  #endif
>
>  	return -EINVAL;
> @@ -7454,6 +7480,7 @@ static void __sched_dynamic_update(int m
>  	preempt_dynamic_enable(preempt_schedule_notrace);
>  	preempt_dynamic_enable(irqentry_exit_cond_resched);
>  	preempt_dynamic_key_disable(preempt_lazy);
> +	preempt_dynamic_key_disable(preempt_promote);
>
>  	switch (mode) {
>  	case preempt_dynamic_none:
> @@ -7464,6 +7491,7 @@ static void __sched_dynamic_update(int m
>  		preempt_dynamic_disable(preempt_schedule_notrace);
>  		preempt_dynamic_disable(irqentry_exit_cond_resched);
>  		preempt_dynamic_key_disable(preempt_lazy);
> +		preempt_dynamic_key_disable(preempt_promote);
>  		if (mode != preempt_dynamic_mode)
>  			pr_info("Dynamic Preempt: none\n");
>  		break;
> @@ -7476,6 +7504,7 @@ static void __sched_dynamic_update(int m
>  		preempt_dynamic_disable(preempt_schedule_notrace);
>  		preempt_dynamic_disable(irqentry_exit_cond_resched);
>  		preempt_dynamic_key_disable(preempt_lazy);
> +		preempt_dynamic_key_disable(preempt_promote);
>  		if (mode != preempt_dynamic_mode)
>  			pr_info("Dynamic Preempt: voluntary\n");
>  		break;
> @@ -7488,6 +7517,7 @@ static void __sched_dynamic_update(int m
>  		preempt_dynamic_enable(preempt_schedule_notrace);
>  		preempt_dynamic_enable(irqentry_exit_cond_resched);
>  		preempt_dynamic_key_disable(preempt_lazy);
> +		preempt_dynamic_key_disable(preempt_promote);
>  		if (mode != preempt_dynamic_mode)
>  			pr_info("Dynamic Preempt: full\n");
>  		break;
> @@ -7500,9 +7530,23 @@ static void __sched_dynamic_update(int m
>  		preempt_dynamic_enable(preempt_schedule_notrace);
>  		preempt_dynamic_enable(irqentry_exit_cond_resched);
>  		preempt_dynamic_key_enable(preempt_lazy);
> +		preempt_dynamic_key_enable(preempt_promote);
>  		if (mode != preempt_dynamic_mode)
>  			pr_info("Dynamic Preempt: lazy\n");
>  		break;
> +
> +	case preempt_dynamic_laziest:
> +		if (!klp_override)
> +			preempt_dynamic_disable(cond_resched);
> +		preempt_dynamic_disable(might_resched);
> +		preempt_dynamic_enable(preempt_schedule);
> +		preempt_dynamic_enable(preempt_schedule_notrace);
> +		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_enable(preempt_lazy);
> +		preempt_dynamic_key_disable(preempt_promote);
> +		if (mode != preempt_dynamic_mode)
> +			pr_info("Dynamic Preempt: laziest\n");
> +		break;
>  	}
>
>  	preempt_dynamic_mode = mode;
> @@ -7567,6 +7611,8 @@ static void __init preempt_dynamic_init(
>  			sched_dynamic_update(preempt_dynamic_voluntary);
>  		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
>  			sched_dynamic_update(preempt_dynamic_lazy);
> +		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZIEST)) {
> +			sched_dynamic_update(preempt_dynamic_laziest);
>  		} else {
>  			/* Default static call setting, nothing to do */
>  			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
> @@ -7588,6 +7634,7 @@ PREEMPT_MODEL_ACCESSOR(none);
>  PREEMPT_MODEL_ACCESSOR(voluntary);
>  PREEMPT_MODEL_ACCESSOR(full);
>  PREEMPT_MODEL_ACCESSOR(lazy);
> +PREEMPT_MODEL_ACCESSOR(laziest);
>
>  #else /* !CONFIG_PREEMPT_DYNAMIC: */
>
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -245,9 +245,9 @@ static ssize_t sched_dynamic_write(struc
>  static int sched_dynamic_show(struct seq_file *m, void *v)
>  {
>  	static const char * preempt_modes[] = {
> -		"none", "voluntary", "full", "lazy",
> +		"none", "voluntary", "full", "lazy", "laziest",
>  	};
> -	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
> +	int j = ARRAY_SIZE(preempt_modes) - 2*!IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
>  	int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
>
>  	for (; i < j; i++) {


--
ankur


* Re: [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
@ 2024-10-08 13:24   ` Sebastian Andrzej Siewior
  2024-10-08 14:40     ` Peter Zijlstra
  2024-10-10  6:52   ` Christoph Hellwig
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2 siblings, 1 reply; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-08 13:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault, Clark Williams

On 2024-10-07 09:46:12 [+0200], Peter Zijlstra wrote:
> In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
> from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
> not a change in how preemption works, but rather it makes a ton more
> code preemptible.
> 
> Notably, take away NONE and VOLATILE options for PREEMPT_RT, they make
> no sense (but are technically possible).

So this is what we do. Okay. This means we can enable the DYNAMIC mode
on PREEMPT_RT-enabled kernels and switch between "full" and the "lazy"
mode(s).
On PREEMPT_RT-enabled kernels with PREEMPT_DYNAMIC the UTS_VERSION
string is set to PREEMPT_RT and PREEMPT_DYNAMIC is not exposed. Is this
on purpose, or did it just happen?

Clark was asking for a file that exposes whether or not PREEMPT_RT is
enabled. I pointed him to UTS_VERSION, but then suggested that it might
be possible to expose the current setting of the preemption model and
use that. But with this change that won't work.
I am not sure whether PREEMPT_DYNAMIC needs to be exposed, and whether
everybody is happy parsing UTS_VERSION (we used to have a
/sys/kernel/realtime file in the RT queue).
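
FWIW, on a PREEMPT_DYNAMIC kernel the active model can already be parsed
out of debugfs, as in the sketch below; that reports the dynamic mode, but
per the above it cannot distinguish a PREEMPT_RT kernel, which is exactly
the gap.

/* Sketch: read the active model; assumes a mounted debugfs. The
 * "(mode)" brackets come from sched_dynamic_show().
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/kernel/debug/sched/preempt", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f)) {
		char *s = strchr(buf, '(');	/* active mode is bracketed */
		if (s)
			printf("current model: %.*s\n",
			       (int)strcspn(s + 1, ")"), s + 1);
	}
	fclose(f);
	return 0;
}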

> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Sebastian


* Re: [PATCH 5/5] sched: Add laziest preempt model
  2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
  2024-10-08  5:59   ` Ankur Arora
@ 2024-10-08 14:23   ` Thomas Gleixner
  2024-10-08 14:40     ` Peter Zijlstra
  2024-10-08 15:07   ` Sebastian Andrzej Siewior
  2 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-08 14:23 UTC (permalink / raw)
  To: Peter Zijlstra, bigeasy, mingo
  Cc: linux-kernel, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Mon, Oct 07 2024 at 09:46, Peter Zijlstra wrote:
> Much like LAZY, except lazier still. It will not promote LAZY to full
> preempt on tick, and it competes with None for suckage.
>
> (do we really want this?)

Just to prove how bad NONE is without cond_resched() ?

I'm sure we can do without that experiment.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-08 13:24   ` Sebastian Andrzej Siewior
@ 2024-10-08 14:40     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-08 14:40 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault, Clark Williams

On Tue, Oct 08, 2024 at 03:24:16PM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-07 09:46:12 [+0200], Peter Zijlstra wrote:
> > In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
> > from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
> > not a change in how preemption works, but rather it makes a ton more
> > code preemptible.
> > 
> > Notably, take away NONE and VOLATILE options for PREEMPT_RT, they make

As I think Mike already noted, typing is hard and this should of course
have been Voluntary :-)

> > no sense (but are techincally possible).
> 
> So this is what we do. Okay. This means we can enable the DYNAMIC mode
> on PREEMPT_RT enabled kernels and switch between "full" and the "lazy"
> mode(s).

Right.

> On PREEMPT_RT enabled kernels with PREEMPT_DYNAMIC the UTS_VERSION
> string is set to PREEMPT_RT and PREEMPT_DYNAMIC is not exposed. Is this
> on purpose, or did it just happen?

I noticed it, didn't care and promptly forgot about it again :-)

> Clark was asking for a file to expose whether or not PREEMPT_RT is
> enabled and I was pointing him to UTS_VERSION, but then suggested that
> it might be possible to expose the current setting of the preemption
> model and use that. But with this change it won't work.
> I am not sure whether PREEMPT_DYNAMIC needs to be exposed, or whether
> everybody is happy parsing UTS_VERSION (we used to have a
> /sys/kernel/realtime file in the RT queue).

Yeah, IDK. This is all just pick a colour :-)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 5/5] sched: Add laziest preempt model
  2024-10-08 14:23   ` Thomas Gleixner
@ 2024-10-08 14:40     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-08 14:40 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: bigeasy, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Tue, Oct 08, 2024 at 04:23:36PM +0200, Thomas Gleixner wrote:
> On Mon, Oct 07 2024 at 09:46, Peter Zijlstra wrote:
> > Much like LAZY, except lazier still. It will not promote LAZY to full
> > preempt on tick, and it competes with None for suckage.
> >
> > (do we really want this?)
> 
> Just to prove how bad NONE is without cond_resched() ?
> 
> I'm sure we can do without that experiment.

There's a reason this is the very last patch :-)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-08  5:43   ` Ankur Arora
@ 2024-10-08 14:48     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-08 14:48 UTC (permalink / raw)
  To: Ankur Arora
  Cc: bigeasy, tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, efault

On Mon, Oct 07, 2024 at 10:43:58PM -0700, Ankur Arora wrote:

> > @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
> >   */
> >  static inline bool preempt_model_preemptible(void)
> >  {
> > -	return preempt_model_full() || preempt_model_rt();
> > +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
> >  }
> 
> In addition to preempt_model_preemptible() we probably also need
> 
>   static inline bool preempt_model_minimize_latency(void)
>   {
>   	return preempt_model_full() || preempt_model_rt();
>   }
> 
> for spin_needbreak()/rwlock_needbreak().
> 
> That would make the behaviour of spin_needbreak() under the lazy model
> similar to none/voluntary.

That whole thing needs rethinking; for one, the preempt_model_rt() check
doesn't really make sense anymore at the end of this.

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
> >
> >  	lockdep_assert_rq_held(rq);
> >
> > +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> > +		tif = TIF_NEED_RESCHED;
> > +
> 
> Tasks with idle policy get handled at the usual user space boundary.
> Maybe a comment reflecting that?

is_idle_task() != SCHED_IDLE. This is about the idle task, which you
always want to force-preempt. But I can stick a comment on.

> > @@ -5598,6 +5627,10 @@ void sched_tick(void)
> >  	update_rq_clock(rq);
> >  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> >  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> > +
> > +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> > +		resched_curr(rq);
> > +
> 
> So this works for SCHED_NORMAL. But does this do the right thing for
> deadline and the other scheduling classes?

Yeah, only fair.c uses resched_curr_lazy(), the others still use
resched_curr() and will work as if Full.

So that is: SCHED_IDLE, SCHED_BATCH and SCHED_NORMAL/OTHER get the lazy
thing, FIFO, RR and DEADLINE get the traditional Full behaviour.
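
A minimal sketch of that split (helper naming is illustrative; the actual
hunks in patch 2 may differ in detail):

	/* fair.c requests a lazy resched, which degrades to the immediate
	 * bit when lazy mode is not active: */
	static void resched_curr_lazy(struct rq *rq)
	{
		int tif = TIF_NEED_RESCHED;

		if (dynamic_preempt_lazy())
			tif = TIF_NEED_RESCHED_LAZY;

		__resched_curr(rq, tif);
	}

	/* rt.c/deadline.c keep calling resched_curr(), i.e. plain
	 * TIF_NEED_RESCHED, so FIFO/RR/DEADLINE preempt as under Full. */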

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 5/5] sched: Add laziest preempt model
  2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
  2024-10-08  5:59   ` Ankur Arora
  2024-10-08 14:23   ` Thomas Gleixner
@ 2024-10-08 15:07   ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-08 15:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:14 [+0200], Peter Zijlstra wrote:
> Much like LAZY, except lazier still. It will not promote LAZY to full
> preempt on tick, and it competes with None for suckage.
> 
> (do we really want this?)

This is like NONE/VOLUNTARY without the .*_resched().
irqentry_exit_cond_resched() and preempt_schedule.*() do nothing
because only the lazy bit is set. This should trigger all the spots
which were filled with cond_resched() to avoid warnings, right?
There is nothing that will force a preemption, right?

> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
…

The description is the same for two lazy models.

> +config PREEMPT_LAZIEST
> +	bool "Scheduler controlled preemption model"
	bool "Scheduler controlled preemption model (relaxed)"

> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is least
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using no preemption.

          The scheduler won't force the task off-CPU if the task does
	  not give up voluntarily.
> +
>  endchoice

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (6 preceding siblings ...)
  2024-10-08  4:58 ` Mike Galbraith
@ 2024-10-08 15:32 ` Sebastian Andrzej Siewior
  2024-10-09  4:40   ` Ankur Arora
                     ` (2 more replies)
  2024-10-09 11:07 ` Sebastian Andrzej Siewior
  2024-10-17 12:36 ` Mike Galbraith
  9 siblings, 3 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-08 15:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
> Hi!
> 
> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
> 
> So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
> build boots and can change the mode.
> 
> Please have a poke.

While comparing this vs what I have:
- need_resched()
  It checked both (tif_need_resched_lazy() || tif_need_resched()) while
  now it only looks at tif_need_resched().
  Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
  lazy. 
  I guess you can argue both ways what makes sense, just noting…

- __account_cfs_rq_runtime() and hrtick_start_fair()
  Both have a resched_curr() instead of resched_curr_lazy(). Is this on
  purpose?

This is actually the main difference (ignoring the moving of the RT bits
and dynamic-sched). The lazy-resched is slightly different, but it should
do the same thing.
I also have tracing and riscv bits, which I will port tomorrow, test, and
add to your pile.

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-08 15:32 ` Sebastian Andrzej Siewior
@ 2024-10-09  4:40   ` Ankur Arora
  2024-10-09  6:20     ` Sebastian Andrzej Siewior
  2024-10-09  7:30   ` Ankur Arora
  2024-10-09  7:46   ` Peter Zijlstra
  2 siblings, 1 reply; 57+ messages in thread
From: Ankur Arora @ 2024-10-09  4:40 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, ankur.a.arora, efault


Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:

> On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
>> Hi!
>>
>> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
>>
>> So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
>> build boots and can change the mode.
>>
>> Please have a poke.
>
> While comparing this vs what I have:
> - need_resched()
>   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
>   now it only looks at tif_need_resched().
>   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
>   lazy.
>   I guess you can argue both ways what makes sense, just noting…

I think we want need_resched() to be only tif_need_resched(). That way
preemption in lazy mode *only* happens at the user mode boundary.

If the scheduler wants to preempt imminently, it just sets (or upgrades to)
TIF_NEED_RESCHED.
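
For reference, the two variants being compared look roughly like this (the
second helper name is made up for illustration):

	/* This series: only the immediate bit makes need_resched() true. */
	static __always_inline bool need_resched(void)
	{
		return unlikely(tif_need_resched());
	}

	/* The old RT-tree behaviour, roughly: */
	static __always_inline bool need_resched_incl_lazy(void)
	{
		return unlikely(tif_need_resched() ||
				tif_test_bit(TIF_NEED_RESCHED_LAZY));
	}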

> - __account_cfs_rq_runtime() and hrtick_start_fair()
>   Both have a resched_curr() instead of resched_curr_lazy(). Is this on
>   purpose?
>
> This is actually the main difference (ignoring the moving of the RT bits
> and dynamic-sched). The lazy-resched is slightly different, but it should
> do the same thing.
> I also have tracing and riscv bits, which I will port tomorrow, test, and
> add to your pile.
>
> Sebastian


--
ankur

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09  4:40   ` Ankur Arora
@ 2024-10-09  6:20     ` Sebastian Andrzej Siewior
  2024-10-09  7:23       ` Ankur Arora
  2024-10-09  8:02       ` Peter Zijlstra
  0 siblings, 2 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09  6:20 UTC (permalink / raw)
  To: Ankur Arora
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, efault

On 2024-10-08 21:40:05 [-0700], Ankur Arora wrote:
> > While comparing this vs what I have:
> > - need_resched()
> >   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
> >   now it only looks at tif_need_resched().
> >   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
> >   lazy.
> >   I guess you can argue both ways what makes sense, just noting…
> 
> I think we want need_resched() to be only tif_need_resched(). That way
> preemption in lazy mode *only* happens at the user mode boundary.

There are places such as __clear_extent_bit() or select_collect() where
need_resched() is checked and, if 0, they loop again. For these kinds of
users it would probably make sense to allow them to preempt themselves.
We could also add a new function which checks both, audit all users,
and check what would make sense based on $criteria.
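
Roughly the shape of those users, for illustration (not the actual btrfs or
dcache code; helper names are made up):

	spin_lock(&lock);
	while (scan_more(&tree)) {
		process_one(&tree);
		if (need_resched())
			break;	/* never taken if only the lazy bit is set */
	}
	spin_unlock(&lock);
	cond_resched();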

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09  6:20     ` Sebastian Andrzej Siewior
@ 2024-10-09  7:23       ` Ankur Arora
  2024-10-09  8:02       ` Peter Zijlstra
  1 sibling, 0 replies; 57+ messages in thread
From: Ankur Arora @ 2024-10-09  7:23 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Ankur Arora, Peter Zijlstra, tglx, mingo, linux-kernel,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, vschneid, efault


Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:

> On 2024-10-08 21:40:05 [-0700], Ankur Arora wrote:
>> > While comparing this vs what I have:
>> > - need_resched()
>> >   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
>> >   now it only looks at tif_need_resched().
>> >   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
>> >   lazy.
>> >   I guess you can argue both ways what makes sense, just noting…
>>
>> I think we want need_resched() to be only tif_need_resched(). That way
>> preemption in lazy mode *only* happens at the user mode boundary.
>
> There are places such as __clear_extent_bit() or select_collect() where
> need_resched() is checked and, if 0, they loop again. For these kinds of
> users it would probably make sense to allow them to preempt themselves.
> We could also add a new function which checks both, audit all users,
> and check what would make sense based on $criteria.

Yeah, I remember having the same thought. But the problem is that the
need_resched() checks are all over the kernel. And figuring out a good
criterion for each of them seems like it might be similar to the
placement problem for cond_resched() -- both being workload dependent.

And, given that the maximum time in the lazy state is limited, it seems
like it'll be simplest to just circumscribe the time spent in the lazy
state by upgrading to TIF_NEED_RESCHED based on some time limit.

That seems to do the job quite well, as Thomas' hog example showed:
 https://lore.kernel.org/lkml/87jzshhexi.ffs@tglx/
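
That limit is essentially what the sched_tick() hunk earlier in the thread
provides; a sketch of the promotion:

	/* Once per tick: if a lazy request is still pending, upgrade it to
	 * an immediate one, so lazy mode delays preemption by at most about
	 * one tick: */
	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
		resched_curr(rq);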

--
ankur

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-08 15:32 ` Sebastian Andrzej Siewior
  2024-10-09  4:40   ` Ankur Arora
@ 2024-10-09  7:30   ` Ankur Arora
  2024-10-09  7:46   ` Peter Zijlstra
  2 siblings, 0 replies; 57+ messages in thread
From: Ankur Arora @ 2024-10-09  7:30 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, ankur.a.arora, efault


Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:

> On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
>> Hi!
>>
>> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
>>
>> So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
>> build boots and can change the mode.
>>
>> Please have a poke.
>
> I also have tracing and riscv bits, which I will port tomorrow, test,
> and add to your pile.

I also have a set of RCU, powerpc, and tracing bits. Since you are
working on the tracing part as well, maybe we can just unify our
patches before sending to Peter?

My patches are at:

 git://github.com/terminus/linux/ sched/lazy

--
ankur

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-08 15:32 ` Sebastian Andrzej Siewior
  2024-10-09  4:40   ` Ankur Arora
  2024-10-09  7:30   ` Ankur Arora
@ 2024-10-09  7:46   ` Peter Zijlstra
  2 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-09  7:46 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Tue, Oct 08, 2024 at 05:32:32PM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
> > Hi!
> > 
> > During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
> > 
> > So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
> > build boots and can change the mode.
> > 
> > Please have a poke.
> 
> While comparing this vs what I have:
> - need_resched()
>   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
>   now it only looks at tif_need_resched().
>   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
>   lazy. 
>   I guess you can argue both ways what makes sense, just noting…

Right, changing need_resched() would mean auditing all existing users,
and there are *MANY* of them.

> - __account_cfs_rq_runtime() and hrtick_start_fair()
>   Both have a resched_curr() instead of resched_curr_lazy(). Is this on
>   purpose?

Sorta, so __account_cfs_rq_runtime() is bandwidth enforcement, and I
figured that since bandwidth is like resource isolation it should be as
accurate as possible.

hrtick is disabled by default (because expensive) and so it doesn't
matter much, but its purpose is to increase accuracy and hence I left
it untouched for now.
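
For context, the bandwidth path in question looks roughly like this
(simplified from kernel/sched/fair.c; error handling and stats elided):

	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
	{
		cfs_rq->runtime_remaining -= delta_exec;

		if (likely(cfs_rq->runtime_remaining > 0))
			return;

		/* Out of quota and unable to refill: resched immediately
		 * (not lazily) so throttling takes effect right away. */
		if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
			resched_curr(rq_of(cfs_rq));
	}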

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09  6:20     ` Sebastian Andrzej Siewior
  2024-10-09  7:23       ` Ankur Arora
@ 2024-10-09  8:02       ` Peter Zijlstra
  2024-10-09  8:45         ` Sebastian Andrzej Siewior
  2024-10-09 14:01         ` Steven Rostedt
  1 sibling, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-09  8:02 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Ankur Arora, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, efault

On Wed, Oct 09, 2024 at 08:20:19AM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-08 21:40:05 [-0700], Ankur Arora wrote:
> > > While comparing this vs what I have:
> > > - need_resched()
> > >   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
> > >   now it only looks at tif_need_resched().
> > >   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
> > >   lazy.
> > >   I guess you can argue both ways what makes sense, just noting…
> > 
> > I think we want need_resched() to be only tif_need_resched(). That way
> > preemption in lazy mode *only* happens at the user mode boundary.
> 
> There are places such as __clear_extent_bit() or select_collect() where
> need_resched() is checked and, if 0, they loop again. For these kinds of
> users it would probably make sense to allow them to preempt themselves.
> We could also add a new function which checks both, audit all users,
> and check what would make sense based on $criteria.

Do we really need this -- wasn't the idea to have things 'delay' until
the actual NEED_RESCHED bit gets set?


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09  8:02       ` Peter Zijlstra
@ 2024-10-09  8:45         ` Sebastian Andrzej Siewior
  2024-10-09 14:01         ` Steven Rostedt
  1 sibling, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09  8:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ankur Arora, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, efault

On 2024-10-09 10:02:02 [+0200], Peter Zijlstra wrote:
> > There are places such as __clear_extent_bit() or select_collect() where
> > need_resched() is checked and, if 0, they loop again. For these kinds of
> > users it would probably make sense to allow them to preempt themselves.
> > We could also add a new function which checks both, audit all users,
> > and check what would make sense based on $criteria.
> 
> Do we really need this -- wasn't the idea to have things 'delay' until
> the actual NEED_RESCHED bit gets set?

Not sure. They asked for it and hold a lock. But I guess it
could make sense to do as much work as possible until they finally have
to go.
This is what we used to have in the RT tree; I'm not complaining, just
comparing notes.

Is there any punishment from the scheduler if they get preempted vs
leaving early? Or is it just the time slice they used up, making them
less eligible next time?

Let me do some more testing, this looks good so far.

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
  2024-10-08  5:43   ` Ankur Arora
@ 2024-10-09  8:50   ` Sebastian Andrzej Siewior
  2024-10-09  9:14     ` Peter Zijlstra
  2024-10-15 14:37   ` Shrikanth Hegde
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09  8:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> +static __always_inline int tif_need_resched_lazy(void)

The naming is a bit confusing here because tif_need_resched() checks
whether TIF_NEED_RESCHED is set, while this returns the proper TIF bit
instead.

> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-09  8:50   ` Sebastian Andrzej Siewior
@ 2024-10-09  9:14     ` Peter Zijlstra
  2024-10-09  9:19       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-09  9:14 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Wed, Oct 09, 2024 at 10:50:21AM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> …
> > +static __always_inline int tif_need_resched_lazy(void)
> 
> The naming is a bit confusing here because tif_need_resched() checks
> whether TIF_NEED_RESCHED is set, while this returns the proper TIF bit
> instead.

Right you are; naming things be hard. How about: get_lazy_tif_bit() ?
There's only the single user anyway.

> > +{
> > +	if (dynamic_preempt_lazy())
> > +		return TIF_NEED_RESCHED_LAZY;
> > +
> > +	return TIF_NEED_RESCHED;
> > +}
> 
> Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-09  9:14     ` Peter Zijlstra
@ 2024-10-09  9:19       ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09  9:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-09 11:14:01 [+0200], Peter Zijlstra wrote:
> On Wed, Oct 09, 2024 at 10:50:21AM +0200, Sebastian Andrzej Siewior wrote:
> > On 2024-10-07 09:46:11 [+0200], Peter Zijlstra wrote:
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> > …
> > > +static __always_inline int tif_need_resched_lazy(void)
> > 
> > The naming is a bit confusing here because tif_need_resched() checks
> > whether TIF_NEED_RESCHED is set, while this returns the proper TIF bit
> > instead.
> 
> Right you are; naming things be hard. How about: get_lazy_tif_bit() ?
> There's only the single user anyway.

perfect.

> > > +{
> > > +	if (dynamic_preempt_lazy())
> > > +		return TIF_NEED_RESCHED_LAZY;
> > > +
> > > +	return TIF_NEED_RESCHED;
> > > +}

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (7 preceding siblings ...)
  2024-10-08 15:32 ` Sebastian Andrzej Siewior
@ 2024-10-09 11:07 ` Sebastian Andrzej Siewior
  2024-10-17 12:36 ` Mike Galbraith
  9 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09 11:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:09 [+0200], Peter Zijlstra wrote:
> Hi!
> 
> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
> 
> So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
> build boots and can change the mode.
> 
> Please have a poke.

I poked. Very happy. Can change mode as well.

Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure
  2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
@ 2024-10-09 12:18   ` Sebastian Andrzej Siewior
  2024-10-09 13:01     ` Peter Zijlstra
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-09 12:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On 2024-10-07 09:46:10 [+0200], Peter Zijlstra wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -964,9 +963,9 @@ static bool set_nr_if_polling(struct tas
>  }
>  
>  #else
> -static inline bool set_nr_and_not_polling(struct task_struct *p)
> +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
>  {
> -	set_tsk_need_resched(p);
> +	atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);

	set_ti_thread_flag(ti, tif);
?
Not sure if there is a benefit over atomic_long_or() but you could avoid
the left shift. I know. Performance critical.

>  	return true;
>  }
>  

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure
  2024-10-09 12:18   ` Sebastian Andrzej Siewior
@ 2024-10-09 13:01     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-09 13:01 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Wed, Oct 09, 2024 at 02:18:59PM +0200, Sebastian Andrzej Siewior wrote:
> On 2024-10-07 09:46:10 [+0200], Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -964,9 +963,9 @@ static bool set_nr_if_polling(struct tas
> >  }
> >  
> >  #else
> > -static inline bool set_nr_and_not_polling(struct task_struct *p)
> > +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
> >  {
> > -	set_tsk_need_resched(p);
> > +	atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);
> 
> 	set_ti_thread_flag(ti, tif);
> ?
> Not sure if there is a benefit over atomic_long_or() but you could avoid
> the left shift. I know. Performance critical.

:-)

Yeah, I suppose set_ti_thread_flag() works just fine here.
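
For reference, the generic helper boils down to a set_bit() on the flags
word, so the open-coded shift goes away; roughly, from
<linux/thread_info.h>:

	static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
	{
		set_bit(flag, (unsigned long *)&ti->flags);
	}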

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09  8:02       ` Peter Zijlstra
  2024-10-09  8:45         ` Sebastian Andrzej Siewior
@ 2024-10-09 14:01         ` Steven Rostedt
  2024-10-09 20:13           ` Thomas Gleixner
  1 sibling, 1 reply; 57+ messages in thread
From: Steven Rostedt @ 2024-10-09 14:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Sebastian Andrzej Siewior, Ankur Arora, tglx, mingo, linux-kernel,
	juri.lelli, vincent.guittot, dietmar.eggemann, bsegall, mgorman,
	vschneid, efault

On Wed, 9 Oct 2024 10:02:02 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Oct 09, 2024 at 08:20:19AM +0200, Sebastian Andrzej Siewior wrote:
> > On 2024-10-08 21:40:05 [-0700], Ankur Arora wrote:  
> > > > While comparing this vs what I have:
> > > > - need_resched()
> > > >   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
> > > >   now it only looks at tif_need_resched().
> > > >   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
> > > >   lazy.
> > > >   I guess you can argue both ways what makes sense, just noting…  
> > > 
> > > I think we want need_resched() to be only tif_need_resched(). That way
> > > preemption in lazy mode *only* happens at the user mode boundary.  
> > 
> > There are places such as __clear_extent_bit() or select_collect() where
> > need_resched() is checked and, if 0, they loop again. For these kinds of
> > users it would probably make sense to allow them to preempt themselves.
> > We could also add a new function which checks both, audit all users,
> > and check what would make sense based on $criteria.
> 
> Do we really need this -- wasn't the idea to have things 'delay' until
> the actual NEED_RESCHED bit gets set?

If we think about it as what would happen with the current PREEMPT_NONE,
wouldn't need_resched() return true as soon as NEED_RESCHED is set? That
means, with PREEMPT_AUTO, it should return true if LAZY_NEED_RESCHED is
set, right? That would make it act the same in both cases.

-- Steve

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 14:01         ` Steven Rostedt
@ 2024-10-09 20:13           ` Thomas Gleixner
  2024-10-09 20:43             ` Steven Rostedt
  0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-09 20:13 UTC (permalink / raw)
  To: Steven Rostedt, Peter Zijlstra
  Cc: Sebastian Andrzej Siewior, Ankur Arora, mingo, linux-kernel,
	juri.lelli, vincent.guittot, dietmar.eggemann, bsegall, mgorman,
	vschneid, efault

On Wed, Oct 09 2024 at 10:01, Steven Rostedt wrote:
> On Wed, 9 Oct 2024 10:02:02 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
>> On Wed, Oct 09, 2024 at 08:20:19AM +0200, Sebastian Andrzej Siewior wrote:
>> > On 2024-10-08 21:40:05 [-0700], Ankur Arora wrote:  
>> > > > While comparing this vs what I have:
>> > > > - need_resched()
>> > > >   It checked both (tif_need_resched_lazy() || tif_need_resched()) while
>> > > >   now it only looks at tif_need_resched().
>> > > >   Also ensured that raw_irqentry_exit_cond_resched() does not trigger on
>> > > >   lazy.
>> > > >   I guess you can argue both ways what makes sense, just noting…  
>> > > 
>> > > I think we want need_resched() to be only tif_need_resched(). That way
>> > > preemption in lazy mode *only* happens at the user mode boundary.  
>> > 
>> > There are places such as __clear_extent_bit() or select_collect() where
>> > need_resched() is checked and, if 0, they loop again. For these kinds of
>> > users it would probably make sense to allow them to preempt themselves.
>> > We could also add a new function which checks both, audit all users,
>> > and check what would make sense based on $criteria.
>> 
>> Do we really need this -- wasn't the idea to have things 'delay' until
>> the actual NEED_RESCHED bit gets set?
>
> If we think about it as what would happen with the current PREEMPT_NONE,
> wouldn't need_resched() return true as soon as NEED_RESCHED is set? That
> means, with PREEMPT_AUTO, it should return true if LAZY_NEED_RESCHED is
> set, right? That would make it act the same in both cases.

I don't think so. Quite a few of these need_resched() things have been
sprinkled around to address the issues with PREEMPT_NONE.

We need to look at those places and figure out whether they need it when
LAZY is enabled. There might be a few which want to look at both flags,
but my expectation is that those are less than 5% of the overall usage.

Peter's choice is the right one. That forces us to look at all of them
and figure out whether they need to be extended to include the lazy bit
or not. Those which do not need it can be eliminated when LAZY is in
effect because that will preempt on the next possible preemption point
once the non-lazy bit is set in the tick.

Remember, the goal is to eliminate all except LAZY (RT is a different
scope) and make the kernel behave correctly according to the expectations
of LAZY (somewhere between NONE and VOLUNTARY) and non-LAZY (equivalent to
FULL).

The reason why this works is that preempt_enable() actually has a
meaning while it does not with NONE.
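
For reference, with a preemptible kernel preempt_enable() is itself a
preemption point; simplified from <linux/preempt.h>:

	#define preempt_enable() \
	do { \
		barrier(); \
		if (unlikely(preempt_count_dec_and_test())) \
			__preempt_schedule(); \
	} while (0)

With NONE it merely decrements the count, so nothing can happen at that
point.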

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 20:13           ` Thomas Gleixner
@ 2024-10-09 20:43             ` Steven Rostedt
  2024-10-09 21:06               ` Thomas Gleixner
  2024-10-10  3:12               ` Tianchen Ding
  0 siblings, 2 replies; 57+ messages in thread
From: Steven Rostedt @ 2024-10-09 20:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Wed, 09 Oct 2024 22:13:51 +0200
Thomas Gleixner <tglx@linutronix.de> wrote:

> > If we think about it as what would happen with the current PREEMPT_NONE,
> > wouldn't need_resched() return true as soon as NEED_RESCHED is set? That
> > means, with PREEMPT_AUTO, it should return true if LAZY_NEED_RESCHED is
> > set, right? That would make it act the same in both cases.  
> 
> I don't think so. Quite a few of these need_resched() things have been
> sprinkled around to address the issues with PREEMPT_NONE.

My point is that the need_resched() calls that are sprinkled around are checking
if a schedule has been requested or not. From that point alone,
LAZY_NEED_RESCHED is something that requested a schedule.

> 
> We need to look at those places and figure out whether they need it when
> LAZY is enabled. There might be a few which want to look at both flags,
> but my expectation is that those are less than 5% of the overall usage.
> 
> Peter's choice is the right one. That forces us to look at all of them
> and figure out whether they need to be extended to include the lazy bit
> or not. Those which do not need it can be eliminated when LAZY is in
> effect because that will preempt on the next possible preemption point
> once the non-lazy bit is set in the tick.
> 
> Remember, the goal is to eliminate all except LAZY (RT is a different
> scope) and make the kernel behave correctly according to the expectations
> of LAZY (somewhere between NONE and VOLUNTARY) and non-LAZY (equivalent to
> FULL).
> 
> The reason why this works is that preempt_enable() actually has a
> meaning while it does not with NONE.

Looking around, I see the pattern of checking need_resched() from within a
loop where a spinlock is held. Then, after breaking out of the loop and
releasing the spinlock, cond_resched() is called, and the loop is entered
again.

Thus, I guess this is the reason you are saying that it should just check
NEED_RESCHED and not the LAZY variant. Because if we remove that
cond_resched() then it would just re-enter the loop with the LAZY bit
still set.

Hmm, but then again...

Perhaps these cond_resched() calls are proper? That is, the need_resched() /
cond_resched() pattern is not something that is done only for PREEMPT_NONE,
but for preempt/voluntary kernels too. Maybe these cond_resched() calls
should stay? If we spin in the loop for one more tick, that actually
changes the behavior of PREEMPT_NONE and PREEMPT_VOLUNTARY, as the
need_resched()/cond_resched() helps with latency. If we just wait for the
next tick, these loops (and there are a lot of them) will all now run for
one tick longer than if PREEMPT_NONE or PREEMPT_VOLUNTARY were set today.


-- Steve

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 20:43             ` Steven Rostedt
@ 2024-10-09 21:06               ` Thomas Gleixner
  2024-10-09 21:19                 ` Steven Rostedt
  2024-10-10 10:23                 ` David Laight
  2024-10-10  3:12               ` Tianchen Ding
  1 sibling, 2 replies; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-09 21:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Wed, Oct 09 2024 at 16:43, Steven Rostedt wrote:
> On Wed, 09 Oct 2024 22:13:51 +0200
> Thomas Gleixner <tglx@linutronix.de> wrote:
>> The reason why this works is that preempt_enable() actually has a
>> meaning while it does not with NONE.
>
> Looking around, I see the pattern of checking need_resched() from within a
> loop where a spinlock is held. Then, after breaking out of the loop and
> releasing the spinlock, cond_resched() is called, and the loop is entered
> again.
>
> Thus, I guess this is the reason you are saying that it should just check
> NEED_RESCHED and not the LAZY variant. Because if we remove that
> cond_resched() then it would just re-enter the loop with the LAZY bit
> still set.
>
> Hmm, but then again...
>
> Perhaps these cond_resched() calls are proper? That is, the need_resched() /
> cond_resched() pattern is not something that is done only for PREEMPT_NONE,
> but for preempt/voluntary kernels too. Maybe these cond_resched() calls
> should stay? If we spin in the loop for one more tick, that actually
> changes the behavior of PREEMPT_NONE and PREEMPT_VOLUNTARY, as the
> need_resched()/cond_resched() helps with latency. If we just wait for the
> next tick, these loops (and there are a lot of them) will all now run for
> one tick longer than if PREEMPT_NONE or PREEMPT_VOLUNTARY were set today.

You are looking at it from the wrong perspective. You are trying to
preserve the status quo. I know that's the path of least effort but it's
the fundamentally wrong approach.

    spin_lock(L);
    while ($cond) {
          do_stuff();
          if (need_resched()) {
          	spin_unlock(L);
                resched();
                spin_lock(L);
          }
    }
    spin_unlock(L);

is the bogus pattern which was introduced to deal with the NONE
shortcomings. That's what we want to get rid of and not proliferate.

For the transition phase we obviously need to do:

    while ($cond) {
          spin_lock(L);
          do_stuff();
          spin_unlock(L);
          cond_resched();
    }

And once all the problems with LAZY are sorted then this cond_resched()
line just goes away and the loop looks like this:

    while ($cond) {
          spin_lock(L);
          do_stuff();
          spin_unlock(L);
    }

There is absolutely no argument that the spinlock-held section needs to
span the loop. We fixed up several of these constructs over the years
and none of them caused a performance regression. Quite the contrary,
quite a few of them improved performance because dropping the lock lets
other CPUs interleave.

Seriously, this crap-preserving mindset has to stop. If we go new ways
then we go all the way.

There is a cost involved for cleaning the existing crap up, but that
cost is well worth it, because anything else is just adding to the ever
increasing accumulation of technical debt. We have enough of that
already.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 21:06               ` Thomas Gleixner
@ 2024-10-09 21:19                 ` Steven Rostedt
  2024-10-09 23:16                   ` Thomas Gleixner
  2024-10-10 10:23                 ` David Laight
  1 sibling, 1 reply; 57+ messages in thread
From: Steven Rostedt @ 2024-10-09 21:19 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Wed, 09 Oct 2024 23:06:00 +0200
Thomas Gleixner <tglx@linutronix.de> wrote:

> > Perhaps these cond_resched() calls are proper? That is, the need_resched() /
> > cond_resched() pattern is not something that is done only for PREEMPT_NONE,
> > but for preempt/voluntary kernels too. Maybe these cond_resched() calls
> > should stay? If we spin in the loop for one more tick, that actually
> > changes the behavior of PREEMPT_NONE and PREEMPT_VOLUNTARY, as the
> > need_resched()/cond_resched() helps with latency. If we just wait for the
> > next tick, these loops (and there are a lot of them) will all now run for
> > one tick longer than if PREEMPT_NONE or PREEMPT_VOLUNTARY were set today.
> 
> You are looking at it from the wrong perspective. You are trying to
> preserve the status quo. I know that's the path of least effort but it's
> the fundamentally wrong approach.
> 
>     spin_lock(L);
>     while ($cond) {
>           do_stuff();
>           if (need_resched()) {
>           	spin_unlock(L);
>                 resched();
>                 spin_lock(L);
>           }
>     }
>     spin_unlock(L);
> 
> is the bogus pattern which was introduced to deal with the NONE
> shortcomings. That's what we want to get rid of and not proliferate.

So this is actually a different issue that you are trying to address. But I
don't see how it had to deal with NONE, as even PREEMPT would suffer from
that loop, right? As you can't preempt while holding the spinlock.

> 
> For the transition phase we obviously need to do:
> 
>     while ($cond) {
>           spin_lock(L);
>           do_stuff();
>           spin_unlock(L);
>           cond_resched();
>     }

But if $cond needs to be protected by spin_lock(), what then?

	spin_lock();
	while ($cond) {
		do_stuff();
		spin_unlock();
		spin_lock();
	}
	spin_unlock();

?

> 
> And once all the problems with LAZY are sorted then this cond_resched()
> line just goes away and the loop looks like this:
> 
>     while ($cond) {
>           spin_lock(L);
>           do_stuff();
>           spin_unlock(L);
>     }
> 
> There is absolutely no argument that the spinlock-held section needs to
> span the loop. We fixed up several of these constructs over the years
> and none of them caused a performance regression. Quite the contrary,
> quite a few of them improved performance because dropping the lock lets
> other CPUs interleave.
> 
> Seriously, this crap-preserving mindset has to stop. If we go new ways
> then we go all the way.

It's not about "crap preserving" but more of taking smaller steps. Then we
can see where a regression happened if one does come up. Kind of like how
you did the x86 64bit/32bit merge. Do steps that keep things as close to
what they were at the start and slowly move toward your goals.

-- Steve

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 21:19                 ` Steven Rostedt
@ 2024-10-09 23:16                   ` Thomas Gleixner
  2024-10-09 23:29                     ` Steven Rostedt
  0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-09 23:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Wed, Oct 09 2024 at 17:19, Steven Rostedt wrote:
> On Wed, 09 Oct 2024 23:06:00 +0200
> Thomas Gleixner <tglx@linutronix.de> wrote:
>> For the transition phase we obviously need to do:
>> 
>>     while ($cond) {
>>           spin_lock(L);
>>           do_stuff();
>>           spin_unlock(L);
>>           cond_resched();
>>     }
>
> But if $cond needs to be protected by spin_lock(), what then?
>
> 	spin_lock();
> 	while ($cond) {
> 		do_stuff();
> 		spin_unlock();
> 		spin_lock();
> 	}
> 	spin_unlock();
>

Seriously? The proper pattern for this is to do either:

	while (READ_ONCE($cond)) {
		scoped_guard(spinlock, &L) {
                	if (!$cond)
                		break;
			do_stuff();
                }
		cond_resched(); // To be removed
	}

or in case that $cond is more complex and needs lock protection:

	while (true) {
		scoped_guard(spinlock, &L) {
                	if (!$cond)
                		break;
			do_stuff();
                }
		cond_resched(); // To be removed
	}

You get the idea, no?

>> Seriously, this crap preserving mindset has to stop. If we go new ways
>> then we go them all the way.
>
It's not about "crap-preserving" but more about taking smaller steps. Then we
> can see where a regression happened if one does come up. Kind of like how
> you did the x86 64bit/32bit merge. Do steps that keep things as close to
> what they were at the start and slowly move toward your goals.

Correct. But if you want to do small steps then you have to look at all
the usage sites first and prepare them _before_ introducing the new
model. That's what I have done for the past 20 years.

The comparison to the 32/64bit merge is completely bogus because that
was just the purely mechanical collocation of the files to make it easy
to consolidate them afterwards. The consolidation was the real effort.

If you want a proper example then look at the CPU hotplug cleanup. There
was a large pile of preparatory patches before we even started to
convert to the state machine concept.

Look at all the other things we've done in the past 20 years of
refactoring to make RT possible. They all follow the same scheme:

   1) Analyze _all_ usage sites, i.e. make an inventory

   2) Define a migration path, i.e. come up with proper abstractions

   3) Convert the usage sites over to the new abstractions

   4) Replace the mechanics in the new abstractions

I certainly tried to short circuit in the early days, but I learned
instantaneously that the short circuit approach is doomed, especially
when you deal with changing the most fundamental parts of an OS.

Your idea of taking smaller steps is fundamentally flawed as it fails
to look at the bigger picture first and just tries to emulate the status
quo without doing the preparatory steps upfront.

Peter's approach is perfectly fine because it provides an opportunity,
which allows us to do the analysis (#1) not only by inspection, but also by
observation, without being disruptive.

That seems to be the more painful approach, but I can assure you it's
less painful than the 'emulate crap just to make progress' approach.

Why?

It forces people to actually analyze the problems and not work around
them with yet another magic duct-tape solution which is just as
ill-defined as cond_resched() or the hideous might_sleep() hack.

The goal is to reduce technical debt and not to replace it with a
different variant.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 23:16                   ` Thomas Gleixner
@ 2024-10-09 23:29                     ` Steven Rostedt
  2024-10-10  1:20                       ` Thomas Gleixner
  0 siblings, 1 reply; 57+ messages in thread
From: Steven Rostedt @ 2024-10-09 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Thu, 10 Oct 2024 01:16:27 +0200
Thomas Gleixner <tglx@linutronix.de> wrote:

> On Wed, Oct 09 2024 at 17:19, Steven Rostedt wrote:
> > On Wed, 09 Oct 2024 23:06:00 +0200
> > Thomas Gleixner <tglx@linutronix.de> wrote:  
> >> For the transition phase we obviously need to do:
> >> 
> >>     while ($cond) {
> >>           spin_lock(L);
> >>           do_stuff();
> >>           spin_unlock(L);
> >>           cond_resched();
> >>     }  
> >
> > But if $cond needs to be protected by spin_lock(), what then?
> >
> > 	spin_lock();
> > 	while ($cond) {
> > 		do_stuff();
> > 		spin_unlock();
> > 		spin_lock();
> > 	}
> > 	spin_unlock();
> >  
> 
> Seriously? The proper pattern for this is to do either:
> 
> 	while (READ_ONCE($cond)) {
> 		scoped_guard(spinlock, &L) {
>                 	if (!$cond)
>                 		break;
> 			do_stuff();
>                 }
> 		cond_resched(); // To be removed
> 	}
> 
> or in case that $cond is more complex and needs lock protection:
> 
> 	while (true) {
> 		scoped_guard(spinlock, &L) {
>                 	if (!$cond)
>                 		break;
> 			do_stuff();
>                 }
> 		cond_resched(); // To be removed
> 	}
> 
> You get the idea, no?

Basically still the same logic, just a different syntax. Speaking of which...

> 
> >> Seriously, this crap-preserving mindset has to stop. If we go new ways
> >> then we go all the way.
> >
> > It's not about "crap-preserving" but more about taking smaller steps. Then we
> > can see where a regression happened if one does come up. Kind of like how
> > you did the x86 64bit/32bit merge. Do steps that keep things as close to
> > what they were at the start and slowly move toward your goals.  
> 
> Correct. But if you want to do small steps then you have to look at all
> the usage sites first and prepare them _before_ introducing the new
> model. That's what I have done for the past 20 years.
> 
> The comparison to the 32/64bit merge is completely bogus because that
> was just the purely mechanical collocation of the files to make it easy
> to consolidate them afterwards. The consolidation was the real effort.
> 
> If you want a proper example then look at the CPU hotplug cleanup. There
> was a large pile of preparatory patches before we even started to
> convert to the state machine concept.
> 
> Look at all the other things we've done in the past 20 years of
> refactoring to make RT possible. They all follow the same scheme:
> 
>    1) Analyze _all_ usage sites, i.e. make an inventory
> 
>    2) Define a migration path, i.e. come up with proper abstractions
> 
>    3) Convert the usage sites over to the new abstractions
> 
>    4) Replace the mechanics in the new abstractions
> 
> I certainly tried to short circuit in the early days, but I learned
> instantaneously that the short circuit approach is doomed, especially
> when you deal with changing the most fundamental parts of an OS.
> 
> Your idea of taking smaller steps is fundamentally flawed as it fails
> to look at the bigger picture first and just tries to emulate the status
> quo without doing the preparatory steps upfront.
> 
> Peter's approach is perfectly fine because it provides an opportunity,
> which allows us to do the analysis (#1) not only by inspection, but also by
> observation, without being disruptive.
> 
> That seems to be the more painful approach, but I can assure you it's
> less painful than the 'emulate crap just to make progress' approach.
> 
> Why?
> 
> It forces people to actually analyze the problems and not work around
> them with yet another magic duct-tape solution which is just as
> ill-defined as cond_resched() or the hideous might_sleep() hack.
> 
> The goal is to reduce technical debt and not to replace it with a
> different variant.
> 

The above definitely sounds like a rationale to switch everything over to Rust!

  /me runs!!!!

-- Steve

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 23:29                     ` Steven Rostedt
@ 2024-10-10  1:20                       ` Thomas Gleixner
  0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-10  1:20 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Wed, Oct 09 2024 at 19:29, Steven Rostedt wrote:
> On Thu, 10 Oct 2024 01:16:27 +0200
> Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> On Wed, Oct 09 2024 at 17:19, Steven Rostedt wrote:
>> > On Wed, 09 Oct 2024 23:06:00 +0200
>> > Thomas Gleixner <tglx@linutronix.de> wrote:  
>> >> For the transition phase we obviously need to do:
>> >> 
>> >>     while ($cond) {
>> >>           spin_lock(L);
>> >>           do_stuff();
>> >>           spin_unlock(L);
>> >>           cond_resched();
>> >>     }  
>> >
>> > But if $cond needs to be protected by spin_lock(), what then?
>> >
>> > 	spin_lock();
>> > 	while ($cond) {
>> > 		do_stuff();
>> > 		spin_unlock();
>> > 		spin_lock();
>> > 	}
>> > 	spin_unlock();
>> >  
>> 
>> Seriously? The proper pattern for this is to do either:
>> 
>> 	while (READ_ONCE($cond)) {
>> 		scoped_guard(spinlock, &L) {
>>                 	if (!$cond)
>>                 		break;
>> 			do_stuff();
>>                 }
>> 		cond_resched(); // To be removed
>> 	}
>> 
>> or in case that $cond is more complex and needs lock protection:
>> 
>> 	while (true) {
>> 		scoped_guard(spinlock, &L) {
>>                 	if (!$cond)
>>                 		break;
>> 			do_stuff();
>>                 }
>> 		cond_resched(); // To be removed
>> 	}
>> 
>> You get the idea, no?
>
> Basically still the same logic, just a different syntax. Speaking of which...

Of course it's the same logic at the very end, but it's logic which is
understandable, makes sense, and allows changing the underlying
infrastructure without disruption.

Refactoring is an art and if you get it wrong you actually make it
worse. Been there, done that.

>> It forces people to actually analyze the problems and not work around
>> them with yet another magic duct tape solution which is equally ill
>> defined as cond_resched() or the hideous might_sleep() hack.
>> 
>> The goal is to reduce technical debt and not to replace it with a
>> different variant.
>> 
>
> The above definitely sounds like a rationale to switch everything over to Rust!
>
>   /me runs!!!!

Why are you running away?

I'm all for moving the kernel to a less error-prone programming
language. I've done ADA programming in my former professional life and
I'm well aware of the advantages of a "safe" programming language.

For me Rust is definitely the right way to go, aside from my personal
distaste for the syntax and the insanity of abandoning an existing
universe of ADA tools just because of 'not invented here', but that's a
different discussion to be had.

That said, converging to Rust requires a massive effort of being
disciplined on the C side in the first place. Otherwise the Rust
abstractions will just end up being a mostly useless wrapper around the
existing C insanities. Which in turn will neither reduce technical debt,
nor be able to live up to the promise of squashing a whole class of bugs.

Quite the contrary, it will accelerate the problems we already have
instead of reducing them, while everyone can pretend that we are so much
better off.

TBH, converging to Rust is a real chance to move away from the "features
first, correctness later" approach, but the way our community is
approaching it right now is just following the old scheme by making Rust
compatible with the "features first, correctness later" mantra.

If you are still telling me that:

>> > 	spin_lock();
>> > 	while ($cond) {
>> > 		do_stuff();
>> > 		spin_unlock();
>> > 		spin_lock();
>> > 	}
>> > 	spin_unlock();

is just different syntax than

>> 	while (true) {
>> 		scoped_guard(spinlock, &L) {
>>                 	if (!$cond)
>>                 		break;
>> 			do_stuff();
>>             }
>> 	}

then please explain to me how you map your proposed scheme to a language
which is scope-based by design.

Thanks,

        tglx


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 20:43             ` Steven Rostedt
  2024-10-09 21:06               ` Thomas Gleixner
@ 2024-10-10  3:12               ` Tianchen Ding
  2024-10-10  7:47                 ` Thomas Gleixner
  1 sibling, 1 reply; 57+ messages in thread
From: Tianchen Ding @ 2024-10-10  3:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault, Thomas Gleixner

On 2024/10/10 04:43, Steven Rostedt wrote:

[...]

> Hmm, but then again...
> 
> Perhaps these cond_resched() calls are proper? That is, the need_resched() /
> cond_resched() pair is not something that is done just for PREEMPT_NONE, but
> for preempt/voluntary kernels too. Maybe these cond_resched() calls should stay?
> If we spin in the loop for one more tick, that actually changes the
> behavior of PREEMPT_NONE and PREEMPT_VOLUNTARY, as the need_resched()/cond_resched()
> pair helps with latency. If we just wait for the next tick, these loops (and
> there are a lot of them) will all now run for one tick longer than if
> PREEMPT_NONE or PREEMPT_VOLUNTARY were set today.
> 

Agree.

And for PREEMPT_LAZIEST, this becomes worse. The fair_class tasks will be
delayed by more than one tick. They may be starved until a non-fair class task
comes along to "save" them.

cond_resched() is designed for NONE/VOLUNTARY to avoid spinning in the kernel
and to prevent softlockups. However, it is a nop in PREEMPT_LAZIEST, and things
may break...

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
  2024-10-08 13:24   ` Sebastian Andrzej Siewior
@ 2024-10-10  6:52   ` Christoph Hellwig
  2024-10-10  7:50     ` Peter Zijlstra
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2 siblings, 1 reply; 57+ messages in thread
From: Christoph Hellwig @ 2024-10-10  6:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: bigeasy, tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Mon, Oct 07, 2024 at 09:46:12AM +0200, Peter Zijlstra wrote:
> In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
> from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
> not a change in how preemption works, but rather it makes a ton more
> code preemptible.
> 
> Notably, take away NONE and VOLUNTARY options for PREEMPT_RT, they make
> no sense (but are technically possible).

Should we take away NONE entirely because it is kinda silly?

Just a bystander here, but the explosion of different preemption models
is a bit silly and not really helpful to users.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-10  3:12               ` Tianchen Ding
@ 2024-10-10  7:47                 ` Thomas Gleixner
  0 siblings, 0 replies; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-10  7:47 UTC (permalink / raw)
  To: Tianchen Ding, Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora, mingo,
	linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, vschneid, efault

On Thu, Oct 10 2024 at 11:12, Tianchen Ding wrote:
> On 2024/10/10 04:43, Steven Rostedt wrote:
>> Perhaps these cond_resched() calls are proper? That is, the need_resched() /
>> cond_resched() pair is not something that is done just for PREEMPT_NONE, but
>> for preempt/voluntary kernels too. Maybe these cond_resched() calls should stay?
>> If we spin in the loop for one more tick, that actually changes the
>> behavior of PREEMPT_NONE and PREEMPT_VOLUNTARY, as the need_resched()/cond_resched()
>> pair helps with latency. If we just wait for the next tick, these loops (and
>> there are a lot of them) will all now run for one tick longer than if
>> PREEMPT_NONE or PREEMPT_VOLUNTARY were set today.
>> 
>
> Agree.
>
> And for PREEMPT_LAZIEST, this becomes worse. The fair_class tasks will be
> delayed by more than one tick. They may be starved until a non-fair class task
> comes along to "save" them.

Everybody agreed already that PREEMPT_LAZIEST is silly and not going to
happen. Nothing to see here.

> cond_resched() is designed for NONE/VOLUNTARY to avoid spinning in the kernel
> and to prevent softlockups. However, it is a nop in PREEMPT_LAZIEST, and things
> may break...

cond_resched() is not designed. It's an ill-defined bandaid and the
purpose of LAZY is to remove it completely along with the preemption
models which depend on it.

Thanks,

        tglx




^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-10  6:52   ` Christoph Hellwig
@ 2024-10-10  7:50     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2024-10-10  7:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: bigeasy, tglx, mingo, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault

On Wed, Oct 09, 2024 at 11:52:36PM -0700, Christoph Hellwig wrote:
> On Mon, Oct 07, 2024 at 09:46:12AM +0200, Peter Zijlstra wrote:
> > In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
> > from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
> > not a change in how preemption works, but rather it makes a ton more
> > code preemptible.
> > 
> > Notably, take away NONE and VOLUNTARY options for PREEMPT_RT, they make
> > no sense (but are technically possible).
> 
> Should we take away NONE entirely because it is kinda silly?
> 
> Just a bystander here, but the explosion of different preemption models
> is a bit silly and not really helpful to users.

The end goal is to get rid of Voluntary and None, but getting there will
require more work. A lot of careful work to be sure. In the end we
should be rid of those two preemption models and the cond_resched() and
much of the random need_resched() hackery they brought.

But we can't do it all at once, so yeah, we need to grow extra crap
before we can take out the old stuff. Nothing new there, but I get your
frustration.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-09 21:06               ` Thomas Gleixner
  2024-10-09 21:19                 ` Steven Rostedt
@ 2024-10-10 10:23                 ` David Laight
  2024-10-13 19:02                   ` Thomas Gleixner
  1 sibling, 1 reply; 57+ messages in thread
From: David Laight @ 2024-10-10 10:23 UTC (permalink / raw)
  To: 'Thomas Gleixner', Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora,
	mingo@kernel.org, linux-kernel@vger.kernel.org,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	dietmar.eggemann@arm.com, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, efault@gmx.de

...
> And once all the problems with LAZY are sorted then this cond_resched()
> line just goes away and the loop looks like this:
> 
>     while ($cond) {
>           spin_lock(L);
>           do_stuff();
>           spin_unlock(L);
>     }

The problem with that pattern is the cost of the atomics.
They can easily be significant, especially if there are
a lot of iterations and do_stuff() is cheap.

If $cond needs the lock, the code is really:
	spin_lock(L);
	while ($cond) {
		do_stuff();
		spin_unlock(L);
		spin_lock(L);
	}
	spin_unlock(L);
which makes it even more obvious that you need a cheap
test to optimise away the unlock/lock pair.

Perhaps it could be wrapped inside a spin_relax(L)?
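
A minimal sketch of what such a helper might look like (spin_relax() is
hypothetical, not an existing kernel API; need_resched() is the existing
cheap TIF-flag test):

	/* Hypothetical helper: only pay for the unlock/lock atomics
	 * when a reschedule is actually pending.
	 */
	static inline void spin_relax(spinlock_t *lock)
	{
		if (need_resched()) {
			spin_unlock(lock);
			/* a preemptible kernel reschedules here */
			spin_lock(lock);
		}
	}

The loop would then keep the lock held across cheap iterations and only
open a preemption window when one is needed:

	spin_lock(L);
	while ($cond) {
		do_stuff();
		spin_relax(L);
	}
	spin_unlock(L);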

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-10 10:23                 ` David Laight
@ 2024-10-13 19:02                   ` Thomas Gleixner
  2024-10-14  8:21                     ` David Laight
  0 siblings, 1 reply; 57+ messages in thread
From: Thomas Gleixner @ 2024-10-13 19:02 UTC (permalink / raw)
  To: David Laight, Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora,
	mingo@kernel.org, linux-kernel@vger.kernel.org,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	dietmar.eggemann@arm.com, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, efault@gmx.de

On Thu, Oct 10 2024 at 10:23, David Laight wrote:
> ...
>> And once all the problems with LAZY are sorted then this cond_resched()
>> line just goes away and the loop looks like this:
>> 
>>     while ($cond) {
>>           spin_lock(L);
>>           do_stuff();
>>           spin_unlock(L);
>>     }
>
> The problem with that pattern is the cost of the atomics.
> They can easily be significant, especially if there are
> a lot of iterations and do_stuff() is cheap.
>
> If $cond needs the lock, the code is really:
> 	spin_lock(L);
> 	while ($cond) {
> 		do_stuff();
> 		spin_unlock(L);
> 		spin_lock(L);
> 	}
> 	spin_unlock(L);
>
> which makes it even more obvious that you need a cheap
> test to optimise away the unlock/lock pair.

You cannot optimize the unlock/lock pair away for a large number of
iterations because then you bring back the problem of extended
latencies.

It does not matter whether $cond is cheap and do_stuff() is cheap. If
you have enough iterations then even a cheap do_stuff() causes massive
latencies (say, ten million iterations at ~100ns each add up to a full
second), unless you keep the horrible cond_resched() mess, which we
are trying to remove.

What you are proposing is a programming antipattern and the lock/unlock
around do_stuff() in the clean loop I outlined is mostly free when there
is no contention, unless you use a pointless micro benchmark which has
an empty (or almost empty) do_stuff() implementation. We are not
optimizing for completely irrelevant theoretical nonsense.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-13 19:02                   ` Thomas Gleixner
@ 2024-10-14  8:21                     ` David Laight
  0 siblings, 0 replies; 57+ messages in thread
From: David Laight @ 2024-10-14  8:21 UTC (permalink / raw)
  To: 'Thomas Gleixner', Steven Rostedt
  Cc: Peter Zijlstra, Sebastian Andrzej Siewior, Ankur Arora,
	mingo@kernel.org, linux-kernel@vger.kernel.org,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	dietmar.eggemann@arm.com, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, efault@gmx.de

From: Thomas Gleixner
> Sent: 13 October 2024 20:02
> 
> On Thu, Oct 10 2024 at 10:23, David Laight wrote:
> > ...
> >> And once all the problems with LAZY are sorted then this cond_resched()
> >> line just goes away and the loop looks like this:
> >>
> >>     while ($cond) {
> >>           spin_lock(L);
> >>           do_stuff();
> >>           spin_unlock(L);
> >>     }
> >
> > The problem with that pattern is the cost of the atomics.
> > They can easily be significant, especially if there are
> > a lot of iterations and do_stuff() is cheap.
> >
> > If $cond needs the lock, the code is really:
> > 	spin_lock(L);
> > 	while ($cond) {
> > 		do_stuff();
> > 		spin_unlock(L);
> > 		spin_lock(L);
> > 	}
> > 	spin_unlock(L);
> >
> > which makes it even more obvious that you need a cheap
> > test to optimise away the unlock/lock pair.
> 
> You cannot optimize the unlock/lock pair away for a large number of
> iterations because then you bring back the problem of extended
> latencies.
> 
> It does not matter whether $cond is cheap and do_stuff() is cheap. If
> you have enough iterations then even a cheap do_stuff() causes massive
> latencies, unless you keep the horrible cond_resched() mess, which we
> are trying to remove.

While cond_resched() can probably go, you need a cheap need_resched()
so the loop above can contain:
		if (need_resched()) {
			spin_unlock(L);
			spin_lock(L);
		}
to avoid the atomics when both $cond and do_stuff() are cheap
but there are a lot of iterations.

There will also be cases where it isn't anywhere near as simple
as unlock/lock (e.g. traversing a linked list), because additional
code is needed to ensure the loop can be continued.
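
For illustration, a lock-dropping list walk typically needs either a
held reference or a restart, because the cursor cannot be trusted across
the unlock. A sketch of the restart shape (struct entry, head,
list_lock, e->done and process() are all made-up names):

	struct entry {
		struct list_head node;
		bool done;	/* progress marker, survives a restart */
	};

	struct entry *e;

restart:
	spin_lock(&list_lock);
	list_for_each_entry(e, &head, node) {
		if (e->done)	/* skip work finished before a restart */
			continue;
		process(e);
		e->done = true;
		if (need_resched()) {
			spin_unlock(&list_lock);
			/* preemption point; 'e' may be freed from here on */
			goto restart;
		}
	}
	spin_unlock(&list_lock);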

> What you are proposing is a programming antipattern and the lock/unlock
> around do_stuff() in the clean loop I outlined is mostly free when there
> is no contention, unless you use a pointless micro benchmark which has
> an empty (or almost empty) do_stuff() implementation. We are not
> optimizing for completely irrelevant theoretical nonsense.

Aren't you adding an extra pair of atomics on every iteration?
That is going to be noticeable.
Never mind the cases where it isn't that simple.

	David

> 
> Thanks,
> 
>         tglx
> 

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
  2024-10-08  5:43   ` Ankur Arora
  2024-10-09  8:50   ` Sebastian Andrzej Siewior
@ 2024-10-15 14:37   ` Shrikanth Hegde
  2024-10-25 10:42     ` Sebastian Andrzej Siewior
  2024-10-22 16:44   ` Shrikanth Hegde
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  4 siblings, 1 reply; 57+ messages in thread
From: Shrikanth Hegde @ 2024-10-15 14:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, vschneid, ankur.a.arora, efault,
	bigeasy, tglx, mingo



On 10/7/24 13:16, Peter Zijlstra wrote:
> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> 
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.

I didn't understand the math here. How?

> 
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
> 
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
> 
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/preempt.h |    8 ++++-
>   kernel/Kconfig.preempt  |   15 +++++++++
>   kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>   kernel/sched/debug.c    |    5 +--
>   kernel/sched/fair.c     |    6 +--
>   kernel/sched/sched.h    |    1
>   6 files changed, 103 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>   extern bool preempt_model_none(void);
>   extern bool preempt_model_voluntary(void);
>   extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>   
>   #else
>   
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>   	return IS_ENABLED(CONFIG_PREEMPT);
>   }
>   
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>   #endif
>   
>   static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>    */
>   static inline bool preempt_model_preemptible(void)
>   {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>   }
>   
>   #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>   	select PREEMPTION
>   	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>   
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>   choice
>   	prompt "Preemption Model"
>   	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>   	  embedded system with latency requirements in the milliseconds
>   	  range.
>   
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>   config PREEMPT_RT
>   	bool "Fully Preemptible Kernel (Real-Time)"
>   	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>   
>   	lockdep_assert_rq_held(rq);
>   
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +
>   	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>   		return;
>   
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>   	__resched_curr(rq, TIF_NEED_RESCHED);
>   }
>   
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);


This should be CONFIG_PREEMPT_LAZY no?

> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
...

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
                   ` (8 preceding siblings ...)
  2024-10-09 11:07 ` Sebastian Andrzej Siewior
@ 2024-10-17 12:36 ` Mike Galbraith
  2024-11-07 17:21   ` Thomas Meyer
  9 siblings, 1 reply; 57+ messages in thread
From: Mike Galbraith @ 2024-10-17 12:36 UTC (permalink / raw)
  To: Peter Zijlstra, bigeasy, tglx, mingo
  Cc: linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, vschneid, ankur.a.arora

[-- Attachment #1: Type: text/plain, Size: 7069 bytes --]

On Mon, 2024-10-07 at 09:46 +0200, Peter Zijlstra wrote:
> Hi!
>
> During LPC Thomas reminded me that the lazy preemption stuff was not there yet.
>
> So here goes, robot says it builds, and I checked both a regular and PREEMPT_RT
> build boots and can change the mode.
>
> Please have a poke.

Lazy seems to be the least sleeper-friendly, and voluntary the most
switch-happy of the three models tested (a bit counter-intuitive).  Not
sure I should post these results, nobody will ever notice these deltas
in their desktop, but they are carefully gathered cold hard numbers, so
what the heck, deleting them takes only a moment.

All data collected against virgin master with only this patch set then
applied, ie there are zero deltas other than the lazy set and model
selection.  The load profiled is my favorite completely hands off mixed
compute vs chrome desktop load, 3 runs of lazy and full compared to 2
runs of voluntary, I called that good enough after looking, not being
willing to do any more of what I have done oodles of in the past days.

I found the sum delay deltas interesting... and sensible after pondering;
nothing being free, delaying preemption of wider-bodied compute tasks
has to impact somewhere. Should desktop components become very busy,
they'll be the ones doing the impacting.

Full per run summaries attached, high speed scroll version below.

desktop util 18.1% static voluntary - virgin source
desktop util 18.3% static voluntary - +lazy patches
desktop util 17.5% lazy             - ditto...
desktop util 17.0% lazy
desktop util 16.8% lazy
desktop util 17.8% full
desktop util 17.8% full
desktop util 17.8% full

Profile summary bottom lines, same order (ignore bottom line delay max, includes nice tasks)
 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
voluntary
  TOTAL:                |2281560.794 ms | 18514617 |                 |       60.836 ms |   1101651.105 ms |
  TOTAL:                |2280493.511 ms | 18679820 |                 |       49.278 ms |   1117078.210 ms |
lazy                                                                                   avg 1109364.657       1.000
  TOTAL:                |2302856.995 ms | 15141600 |                 |       68.303 ms |   1316229.527 ms |
  TOTAL:                |2299911.697 ms | 15979830 |                 |       29.361 ms |   1276128.457 ms |
  TOTAL:                |2298180.079 ms | 16665916 |                 |       34.918 ms |   1271137.849 ms |
full                                                                                   avg 1287831.944       1.160
  TOTAL:                |2279751.696 ms | 16628663 |                 |       36.929 ms |    977433.245 ms |
  TOTAL:                |2278306.187 ms | 15846728 |                 |       79.954 ms |    974772.318 ms |
  TOTAL:                |2282262.311 ms | 17152068 |                 |       28.004 ms |    990068.590 ms |
                                                                                       avg  980758.051        .884

Detail extract for the 3 top players:
voluntary
  massive_intr:(9)      |1869421.285 ms |  5752848 | avg:   0.056 ms | max:  23.502 ms | sum:324608.328 ms |
  massive_intr:(9)      |1864362.822 ms |  5606194 | avg:   0.059 ms | max:  20.343 ms | sum:328920.012 ms |
                                                                                       avg   326764.170      1.000
  dav1d-worker:(8)      | 155700.590 ms |  1091524 | avg:   0.300 ms | max:  25.215 ms | sum:327735.383 ms |
  dav1d-worker:(8)      | 160730.867 ms |  1123134 | avg:   0.299 ms | max:  26.783 ms | sum:335553.750 ms |
                                                                                       avg   331644.566      1.000
  X:2534                |  51690.650 ms |   223470 | avg:   0.122 ms | max:  10.367 ms | sum:27371.213 ms |
  X:2522                |  51355.479 ms |   232615 | avg:   0.121 ms | max:  14.223 ms | sum:28243.127 ms |
                                                                                       avg   27807.170       1.000
lazy
  massive_intr:(9)      |1900362.729 ms |  4443987 | avg:   0.066 ms | max:  23.997 ms | sum:294806.423 ms |
  massive_intr:(9)      |1909494.321 ms |  4624111 | avg:   0.061 ms | max:  20.215 ms | sum:282640.873 ms |
  massive_intr:(9)      |1913482.381 ms |  4896729 | avg:   0.057 ms | max:  19.473 ms | sum:277651.582 ms |
                                                                                       avg   285032.959       .872
  dav1d-worker:(8)      | 164649.511 ms |   849691 | avg:   0.531 ms | max:  21.537 ms | sum:451592.116 ms |
  dav1d-worker:(8)      | 151579.745 ms |   850584 | avg:   0.505 ms | max:  29.119 ms | sum:429540.789 ms |
  dav1d-worker:(8)      | 145472.762 ms |   860714 | avg:   0.489 ms | max:  22.005 ms | sum:420927.393 ms |
                                                                                       avg   434020.099      1.308
  X:2508                |  53119.214 ms |   317117 | avg:   0.135 ms | max:  11.995 ms | sum:42917.248 ms |
  X:2508                |  53047.311 ms |   311231 | avg:   0.137 ms | max:  17.974 ms | sum:42601.534 ms |
  X:2508                |  48692.053 ms |   333137 | avg:   0.129 ms | max:  13.263 ms | sum:43140.962 ms |
                                                                                       avg   42886.581       1.542
full
  massive_intr:(9)      |1875426.524 ms |  5559171 | avg:   0.057 ms | max:  19.817 ms | sum:315854.813 ms |
  massive_intr:(9)      |1873997.628 ms |  4945112 | avg:   0.064 ms | max:  18.035 ms | sum:315257.596 ms |
  massive_intr:(9)      |1876339.600 ms |  5569489 | avg:   0.057 ms | max:  20.473 ms | sum:317771.140 ms |
                                                                                       avg   316294.516       .967
  dav1d-worker:(8)      | 157496.413 ms |   914670 | avg:   0.333 ms | max:  24.006 ms | sum:304944.125 ms |
  dav1d-worker:(8)      | 161723.573 ms |   879090 | avg:   0.354 ms | max:  26.070 ms | sum:310806.145 ms |
  dav1d-worker:(8)      | 157653.936 ms |   929556 | avg:   0.331 ms | max:  21.664 ms | sum:307315.109 ms |
                                                                                       avg   307688.459       .927
  X:2502                |  52845.691 ms |   131220 | avg:   0.135 ms | max:  11.479 ms | sum:17656.654 ms |
  X:2502                |  47884.248 ms |   134000 | avg:   0.148 ms | max:  12.068 ms | sum:19768.297 ms |
  X:2502                |  51731.371 ms |   145167 | avg:   0.132 ms | max:  15.995 ms | sum:19180.803 ms |
                                                                                       avg   18868.584        .678




[-- Attachment #2: summaries.txt --]
[-- Type: text/plain, Size: 19365 bytes --]

key: + postfix to name means lazy patch set applied, +lazy means dynamic with lazy default.
     No postfix or only + postfix means static voluntary (my chosen default).
     end of line postfix in parens = model during test run

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin --sort=runtime -S 15 -T (static voluntary)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1869421.285 ms |  5752848 | avg:   0.056 ms | max:  23.502 ms | sum:324608.328 ms |
  dav1d-worker:(8)      | 155700.590 ms |  1091524 | avg:   0.300 ms | max:  25.215 ms | sum:327735.383 ms |
  X:2534                |  51690.650 ms |   223470 | avg:   0.122 ms | max:  10.367 ms | sum:27371.213 ms |
  VizCompositorTh:4732  |  37826.030 ms |   176382 | avg:   0.183 ms | max:  11.527 ms | sum:32269.244 ms |
  ThreadPoolForeg:(50)  |  23708.799 ms |   162325 | avg:   0.276 ms | max:  22.754 ms | sum:44853.075 ms |
  kworker/u32:3-e:67    |  21922.222 ms |  3950152 | avg:   0.013 ms | max:  12.980 ms | sum:49901.452 ms |
  kwin_x11:2927         |  18956.612 ms |   197986 | avg:   0.089 ms | max:  14.978 ms | sum:17669.582 ms |
  chrome:(9)            |  18146.220 ms |   162581 | avg:   0.181 ms | max:  18.931 ms | sum:29448.275 ms |
  VideoFrameCompo:4740  |  16994.546 ms |   206270 | avg:   0.159 ms | max:  12.552 ms | sum:32785.727 ms |
  kworker/u32:2-e:66    |  12923.440 ms |  2337049 | avg:   0.013 ms | max:  12.206 ms | sum:29439.455 ms |
  kthreadd:(4)          |   8651.856 ms |  1545898 | avg:   0.013 ms | max:  13.244 ms | sum:20645.416 ms |
  Compositor:(2)        |   7434.109 ms |   121329 | avg:   0.154 ms | max:  10.758 ms | sum:18628.354 ms |
  Chrome_ChildIOT:(6)   |   7164.010 ms |   242003 | avg:   0.167 ms | max:  12.135 ms | sum:40383.425 ms |
  Media:4739            |   6188.846 ms |   100660 | avg:   0.173 ms | max:  16.322 ms | sum:17384.775 ms |
  kworker/u32:5+e:69    |   4557.938 ms |   812666 | avg:   0.012 ms | max:  11.581 ms | sum:10014.729 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2281560.794 ms | 18514617 |                 |       60.836 ms |   1101651.105 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 18.1%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+ --sort=runtime -S 15 -T (static voluntary)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1864362.822 ms |  5606194 | avg:   0.059 ms | max:  20.343 ms | sum:328920.012 ms |
  dav1d-worker:(8)      | 160730.867 ms |  1123134 | avg:   0.299 ms | max:  26.783 ms | sum:335553.750 ms |
  X:2522                |  51355.479 ms |   232615 | avg:   0.121 ms | max:  14.223 ms | sum:28243.127 ms |
  VizCompositorTh:4708  |  37705.826 ms |   175678 | avg:   0.186 ms | max:  12.171 ms | sum:32642.549 ms |
  kworker/u32:1-e:65    |  23712.990 ms |  4302933 | avg:   0.012 ms | max:  14.379 ms | sum:50073.553 ms |
  ThreadPoolForeg:(51)  |  23354.248 ms |   146424 | avg:   0.305 ms | max:  32.721 ms | sum:44677.022 ms |
  chrome:(9)            |  17611.783 ms |   144228 | avg:   0.199 ms | max:  22.482 ms | sum:28732.916 ms |
  VideoFrameCompo:4716  |  17198.595 ms |   183878 | avg:   0.178 ms | max:  15.286 ms | sum:32792.967 ms |
  kwin_x11:2923         |  17183.572 ms |   190690 | avg:   0.105 ms | max:  11.328 ms | sum:20051.466 ms |
  kworker/u32:3-e:67    |  16584.163 ms |  3004657 | avg:   0.012 ms | max:  12.297 ms | sum:36189.183 ms |
  kworker/u32:6+e:70    |   8787.851 ms |  1589820 | avg:   0.012 ms | max:   9.698 ms | sum:19232.965 ms |
  Chrome_ChildIOT:(7)   |   7652.436 ms |   240093 | avg:   0.172 ms | max:  15.259 ms | sum:41196.085 ms |
  Compositor:(2)        |   7504.380 ms |   110731 | avg:   0.176 ms | max:  15.048 ms | sum:19475.037 ms |
  Media:4715            |   6080.907 ms |    97641 | avg:   0.180 ms | max:  16.142 ms | sum:17589.459 ms |
  kworker/u32:4-e:68    |   3700.463 ms |   665402 | avg:   0.011 ms | max:   9.711 ms | sum: 7460.158 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2280493.511 ms | 18679820 |                 |       49.278 ms |   1117078.210 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 18.3%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (lazy)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1900362.729 ms |  4443987 | avg:   0.066 ms | max:  23.997 ms | sum:294806.423 ms |
  dav1d-worker:(8)      | 164649.511 ms |   849691 | avg:   0.531 ms | max:  21.537 ms | sum:451592.116 ms |
  X:2508                |  53119.214 ms |   317117 | avg:   0.135 ms | max:  11.995 ms | sum:42917.248 ms |
  VizCompositorTh:4721  |  36975.869 ms |   177119 | avg:   0.214 ms | max:  14.383 ms | sum:37942.002 ms |
  ThreadPoolForeg:(50)  |  21374.995 ms |   161261 | avg:   0.333 ms | max:  21.738 ms | sum:53733.127 ms |
  chrome:(9)            |  17705.822 ms |   148144 | avg:   0.212 ms | max:  13.684 ms | sum:31335.048 ms |
  kwin_x11:2923         |  16293.433 ms |   195305 | avg:   0.095 ms | max:  11.748 ms | sum:18627.124 ms |
  VideoFrameCompo:4729  |  15607.441 ms |   192906 | avg:   0.198 ms | max:  26.420 ms | sum:38273.853 ms |
  kworker/u32:2+e:67    |  14288.685 ms |  2697907 | avg:   0.016 ms | max:  11.137 ms | sum:42158.772 ms |
  kworker/u32:4-e:69    |  10809.304 ms |  2030937 | avg:   0.014 ms | max:  12.982 ms | sum:29118.478 ms |
  kworker/u32:0-e:11    |   9670.564 ms |  1804924 | avg:   0.015 ms | max:  12.053 ms | sum:27919.959 ms |
  Compositor:(2)        |   7720.155 ms |   117533 | avg:   0.227 ms | max:  13.540 ms | sum:26654.650 ms |
  Chrome_ChildIOT:(8)   |   7168.602 ms |   267822 | avg:   0.237 ms | max:  15.399 ms | sum:63353.885 ms |
  Media:4728            |   6501.220 ms |   111442 | avg:   0.278 ms | max:  15.456 ms | sum:31018.087 ms |
  kworker/u32:1-e:66    |   2912.893 ms |   542561 | avg:   0.014 ms | max:   8.670 ms | sum: 7829.852 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2302856.995 ms | 15141600 |                 |       68.303 ms |   1316229.527 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 17.5%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (lazy)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1909494.321 ms |  4624111 | avg:   0.061 ms | max:  20.215 ms | sum:282640.873 ms |
  dav1d-worker:(8)      | 151579.745 ms |   850584 | avg:   0.505 ms | max:  29.119 ms | sum:429540.789 ms |
  X:2508                |  53047.311 ms |   311231 | avg:   0.137 ms | max:  17.974 ms | sum:42601.534 ms |
  VizCompositorTh:6893  |  37686.973 ms |   170197 | avg:   0.216 ms | max:  11.311 ms | sum:36749.812 ms |
  ThreadPoolForeg:(44)  |  21710.844 ms |   146025 | avg:   0.362 ms | max:  28.152 ms | sum:52931.194 ms |
  chrome:(9)            |  16248.165 ms |   131728 | avg:   0.210 ms | max:  16.033 ms | sum:27695.200 ms |
  VideoFrameCompo:6900  |  15484.140 ms |   169752 | avg:   0.223 ms | max:  13.208 ms | sum:37797.691 ms |
  kwin_x11:2923         |  15121.315 ms |   197942 | avg:   0.100 ms | max:  11.027 ms | sum:19749.568 ms |
  kthreadd:(5)          |  13906.868 ms |  2631171 | avg:   0.013 ms | max:  10.195 ms | sum:34262.373 ms |
  kworker/u32:1:6943    |  10539.864 ms |  1973511 | avg:   0.015 ms | max:  11.001 ms | sum:28668.999 ms |
  kworker/u32:10-:443   |  10211.276 ms |  1901469 | avg:   0.014 ms | max:  10.742 ms | sum:26474.182 ms |
  Compositor:(2)        |   7353.627 ms |   117889 | avg:   0.215 ms | max:  23.917 ms | sum:25324.895 ms |
  Chrome_ChildIOT:(6)   |   7005.489 ms |   256715 | avg:   0.243 ms | max:  17.614 ms | sum:62365.700 ms |
  Media:6899            |   6223.856 ms |   107444 | avg:   0.287 ms | max:  13.516 ms | sum:30849.560 ms |
  kworker/u32:0+e:5784  |   4113.435 ms |   783320 | avg:   0.015 ms | max:  10.960 ms | sum:11400.850 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2299911.697 ms | 15979830 |                 |       29.361 ms |   1276128.457 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 17.0%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (lazy)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1913482.381 ms |  4896729 | avg:   0.057 ms | max:  19.473 ms | sum:277651.582 ms |
  dav1d-worker:(8)      | 145472.762 ms |   860714 | avg:   0.489 ms | max:  22.005 ms | sum:420927.393 ms |
  X:2508                |  48692.053 ms |   333137 | avg:   0.129 ms | max:  13.263 ms | sum:43140.962 ms |
  VizCompositorTh:7462  |  38247.597 ms |   190153 | avg:   0.194 ms | max:  15.145 ms | sum:36874.818 ms |
  ThreadPoolForeg:(48)  |  20907.127 ms |   154823 | avg:   0.337 ms | max:  22.605 ms | sum:52112.821 ms |
  chrome:(9)            |  17173.249 ms |   145326 | avg:   0.206 ms | max:  15.988 ms | sum:29981.372 ms |
  kwin_x11:2923         |  16655.041 ms |   206479 | avg:   0.096 ms | max:  20.335 ms | sum:19726.071 ms |
  kworker/u32:3-e:6967  |  16211.545 ms |  3036606 | avg:   0.013 ms | max:  14.750 ms | sum:39934.280 ms |
  VideoFrameCompo:7469  |  15487.829 ms |   183878 | avg:   0.205 ms | max:  11.149 ms | sum:37702.223 ms |
  kworker/u32:10+:443   |  12143.699 ms |  2296001 | avg:   0.015 ms | max:  12.346 ms | sum:35025.772 ms |
  kthreadd:(3)          |   8806.543 ms |  1663465 | avg:   0.013 ms | max:  10.732 ms | sum:22443.117 ms |
  Compositor:(2)        |   7433.129 ms |   118757 | avg:   0.215 ms | max:  13.616 ms | sum:25591.282 ms |
  Chrome_ChildIOT:(7)   |   7305.424 ms |   270553 | avg:   0.230 ms | max:  15.037 ms | sum:62314.818 ms |
  kworker/u32:0-e:5784  |   7032.254 ms |  1305707 | avg:   0.016 ms | max:  10.448 ms | sum:21170.660 ms |
  Media:7468            |   6246.502 ms |   114745 | avg:   0.269 ms | max:  14.205 ms | sum:30891.031 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2298180.079 ms | 16665916 |                 |       34.918 ms |   1271137.849 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 16.8%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (full)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1875426.524 ms |  5559171 | avg:   0.057 ms | max:  19.817 ms | sum:315854.813 ms |
  dav1d-worker:(8)      | 157496.413 ms |   914670 | avg:   0.333 ms | max:  24.006 ms | sum:304944.125 ms |
  X:2502                |  52845.691 ms |   131220 | avg:   0.135 ms | max:  11.479 ms | sum:17656.654 ms |
  VizCompositorTh:4696  |  38197.336 ms |   141737 | avg:   0.137 ms | max:  18.307 ms | sum:19416.930 ms |
  ThreadPoolForeg:(51)  |  23793.225 ms |   134975 | avg:   0.261 ms | max:  31.737 ms | sum:35236.105 ms |
  kworker/u32:0+e:11    |  17946.233 ms |  3215726 | avg:   0.013 ms | max:  10.694 ms | sum:43005.547 ms |
  chrome:(9)            |  17507.091 ms |   126012 | avg:   0.193 ms | max:  23.634 ms | sum:24345.823 ms |
  VideoFrameCompo:4704  |  16601.850 ms |   182901 | avg:   0.113 ms | max:  19.033 ms | sum:20589.840 ms |
  kwin_x11:2893         |  15846.601 ms |   127128 | avg:   0.062 ms | max:  12.846 ms | sum: 7940.111 ms |
  kworker/u32:1-e:66    |  12786.558 ms |  2287904 | avg:   0.014 ms | max:  11.131 ms | sum:31205.819 ms |
  kworker/u32:2-e:67    |  10965.214 ms |  1980760 | avg:   0.014 ms | max:   9.275 ms | sum:27031.219 ms |
  Compositor:(2)        |   7367.684 ms |   110725 | avg:   0.122 ms | max:  13.756 ms | sum:13491.791 ms |
  Chrome_ChildIOT:(6)   |   7104.071 ms |   201664 | avg:   0.147 ms | max:  12.163 ms | sum:29735.926 ms |
  Media:4703            |   6053.286 ms |    98687 | avg:   0.150 ms | max:  12.463 ms | sum:14815.330 ms |
  kworker/u32:9-e:441   |   3229.031 ms |   573689 | avg:   0.014 ms | max:   9.088 ms | sum: 7951.808 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2279751.696 ms | 16628663 |                 |       36.929 ms |    977433.245 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 17.8%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (full)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1873997.628 ms |  4945112 | avg:   0.064 ms | max:  18.035 ms | sum:315257.596 ms |
  dav1d-worker:(8)      | 161723.573 ms |   879090 | avg:   0.354 ms | max:  26.070 ms | sum:310806.145 ms |
  X:2502                |  47884.248 ms |   134000 | avg:   0.148 ms | max:  12.068 ms | sum:19768.297 ms |
  VizCompositorTh:5049  |  37631.978 ms |   134948 | avg:   0.139 ms | max:  20.005 ms | sum:18802.998 ms |
  ThreadPoolForeg:(48)  |  23020.803 ms |   116519 | avg:   0.291 ms | max:  29.007 ms | sum:33854.527 ms |
  chrome:(9)            |  17637.863 ms |   119032 | avg:   0.204 ms | max:  19.511 ms | sum:24251.886 ms |
  VideoFrameCompo:5056  |  17097.638 ms |   147969 | avg:   0.140 ms | max:  11.943 ms | sum:20646.996 ms |
  kwin_x11:2893         |  16276.749 ms |   129768 | avg:   0.083 ms | max:  12.346 ms | sum:10783.203 ms |
  kworker/u32:0+e:11    |  13930.578 ms |  2506472 | avg:   0.012 ms | max:  10.309 ms | sum:30183.580 ms |
  kworker/u32:1-e:66    |  12053.560 ms |  2168793 | avg:   0.012 ms | max:   9.961 ms | sum:26421.505 ms |
  kthreadd:(4)          |   8379.998 ms |  1491424 | avg:   0.014 ms | max:   9.089 ms | sum:20605.324 ms |
  Compositor:(2)        |   7525.028 ms |   103715 | avg:   0.136 ms | max:  13.257 ms | sum:14070.263 ms |
  Chrome_ChildIOT:(8)   |   7029.210 ms |   190254 | avg:   0.155 ms | max:  20.680 ms | sum:29474.689 ms |
  Media:5055            |   6172.148 ms |    90196 | avg:   0.166 ms | max:  11.774 ms | sum:15011.825 ms |
  kworker/u32:9-i:441   |   5586.635 ms |  1007957 | avg:   0.011 ms | max:  14.529 ms | sum:11402.402 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2278306.187 ms | 15846728 |                 |       79.954 ms |    974772.318 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 17.8%

perf sched lat -i perf.data.full.6.12.0.g6485cf5-master-virgin+lazy --sort=runtime -S 15 -T (full)

 ----------------------------------------------------------------------------------------------------------
  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Sum delay ms     |
 ----------------------------------------------------------------------------------------------------------
  massive_intr:(9)      |1876339.600 ms |  5569489 | avg:   0.057 ms | max:  20.473 ms | sum:317771.140 ms |
  dav1d-worker:(8)      | 157653.936 ms |   929556 | avg:   0.331 ms | max:  21.664 ms | sum:307315.109 ms |
  X:2502                |  51731.371 ms |   145167 | avg:   0.132 ms | max:  15.995 ms | sum:19180.803 ms |
  VizCompositorTh:5879  |  37636.433 ms |   145822 | avg:   0.128 ms | max:  13.675 ms | sum:18675.236 ms |
  ThreadPoolForeg:(50)  |  22727.468 ms |   118237 | avg:   0.297 ms | max:  28.004 ms | sum:35105.904 ms |
  kworker/u32:1+e:66    |  18944.682 ms |  3398565 | avg:   0.013 ms | max:  12.287 ms | sum:44070.394 ms |
  chrome:(9)            |  17697.852 ms |   128113 | avg:   0.193 ms | max:  19.843 ms | sum:24676.480 ms |
  kthreadd:(3)          |  17129.315 ms |  3074837 | avg:   0.013 ms | max:  14.654 ms | sum:41107.807 ms |
  VideoFrameCompo:5886  |  16956.260 ms |   167159 | avg:   0.125 ms | max:  12.120 ms | sum:20955.174 ms |
  kwin_x11:2893         |  15744.805 ms |   128937 | avg:   0.076 ms | max:  15.872 ms | sum: 9811.977 ms |
  kworker/u32:3-e:5390  |   7597.088 ms |  1370365 | avg:   0.013 ms | max:   9.726 ms | sum:18185.745 ms |
  Compositor:(2)        |   7591.781 ms |   110688 | avg:   0.127 ms | max:  13.707 ms | sum:14065.114 ms |
  Chrome_ChildIOT:(6)   |   6916.739 ms |   198787 | avg:   0.149 ms | max:  12.443 ms | sum:29695.890 ms |
  Media:5885            |   6161.402 ms |    95843 | avg:   0.158 ms | max:  13.595 ms | sum:15189.381 ms |
  kworker/u32:0-e:11    |   3038.438 ms |   520012 | avg:   0.014 ms | max:   9.409 ms | sum: 7259.283 ms |
 ----------------------------------------------------------------------------------------------------------
  TOTAL:                |2282262.311 ms | 17152068 |                 |       28.004 ms |    990068.590 ms |
 ----------------------------------------------------------------------------------------------------------
desktop util 17.8%


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
                     ` (2 preceding siblings ...)
  2024-10-15 14:37   ` Shrikanth Hegde
@ 2024-10-22 16:44   ` Shrikanth Hegde
  2024-10-25 13:19     ` Sebastian Andrzej Siewior
  2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  4 siblings, 1 reply; 57+ messages in thread
From: Shrikanth Hegde @ 2024-10-22 16:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, vschneid, ankur.a.arora, efault,
	bigeasy, tglx, mingo



On 10/7/24 13:16, Peter Zijlstra wrote:
> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> 
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.
> 
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
> 
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
> 
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/preempt.h |    8 ++++-
>   kernel/Kconfig.preempt  |   15 +++++++++
>   kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>   kernel/sched/debug.c    |    5 +--
>   kernel/sched/fair.c     |    6 +--
>   kernel/sched/sched.h    |    1
>   6 files changed, 103 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>   extern bool preempt_model_none(void);
>   extern bool preempt_model_voluntary(void);
>   extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>   
>   #else
>   
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>   	return IS_ENABLED(CONFIG_PREEMPT);
>   }
>   
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>   #endif
>   
>   static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>    */
>   static inline bool preempt_model_preemptible(void)
>   {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>   }
>   
>   #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>   	select PREEMPTION
>   	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>   
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>   choice
>   	prompt "Preemption Model"
>   	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>   	  embedded system with latency requirements in the milliseconds
>   	  range.
>   
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>   config PREEMPT_RT
>   	bool "Fully Preemptible Kernel (Real-Time)"
>   	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>   
>   	lockdep_assert_rq_held(rq);
>   
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +
>   	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>   		return;
>   
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>   	__resched_curr(rq, TIF_NEED_RESCHED);
>   }
>   
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);

I had to make it CONFIG_PREEMPT_LAZY for lazy preemption to work
on systems where CONFIG_PREEMPT_DYNAMIC=n.

> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
>   void resched_cpu(int cpu)
>   {
>   	struct rq *rq = cpu_rq(cpu);
> @@ -5598,6 +5627,10 @@ void sched_tick(void)
>   	update_rq_clock(rq);
>   	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>   	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +		resched_curr(rq);
> +
>   	curr->sched_class->task_tick(rq, curr, 0);
>   	if (sched_feat(LATENCY_WARN))
>   		resched_latency = cpu_resched_latency(rq);
> @@ -7334,6 +7367,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * VOLUNTARY:
>    *   cond_resched               <- __cond_resched
> @@ -7341,6 +7375,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * FULL:
>    *   cond_resched               <- RET0
> @@ -7348,6 +7383,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- preempt_schedule
>    *   preempt_schedule_notrace   <- preempt_schedule_notrace
>    *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- false
> + *
> + * LAZY:
> + *   cond_resched               <- RET0
> + *   might_resched              <- RET0
> + *   preempt_schedule           <- preempt_schedule
> + *   preempt_schedule_notrace   <- preempt_schedule_notrace
> + *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- true
>    */
>   
>   enum {
> @@ -7355,6 +7399,7 @@ enum {
>   	preempt_dynamic_none,
>   	preempt_dynamic_voluntary,
>   	preempt_dynamic_full,
> +	preempt_dynamic_lazy,
>   };
>   
>   int preempt_dynamic_mode = preempt_dynamic_undefined;
> @@ -7370,15 +7415,23 @@ int sched_dynamic_mode(const char *str)
>   	if (!strcmp(str, "full"))
>   		return preempt_dynamic_full;
>   
> +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
> +	if (!strcmp(str, "lazy"))
> +		return preempt_dynamic_lazy;
> +#endif
> +
>   	return -EINVAL;
>   }
>   
> +#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +
>   #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
>   #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
>   #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
>   #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
> -#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> -#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
> +#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
>   #else
>   #error "Unsupported PREEMPT_DYNAMIC mechanism"
>   #endif
> @@ -7398,6 +7451,7 @@ static void __sched_dynamic_update(int m
>   	preempt_dynamic_enable(preempt_schedule);
>   	preempt_dynamic_enable(preempt_schedule_notrace);
>   	preempt_dynamic_enable(irqentry_exit_cond_resched);
> +	preempt_dynamic_key_disable(preempt_lazy);
>   
>   	switch (mode) {
>   	case preempt_dynamic_none:
> @@ -7407,6 +7461,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: none\n");
>   		break;
> @@ -7418,6 +7473,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: voluntary\n");
>   		break;
> @@ -7429,9 +7485,22 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_enable(preempt_schedule);
>   		preempt_dynamic_enable(preempt_schedule_notrace);
>   		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: full\n");
>   		break;
> +
> +	case preempt_dynamic_lazy:
> +		if (!klp_override)
> +			preempt_dynamic_disable(cond_resched);
> +		preempt_dynamic_disable(might_resched);
> +		preempt_dynamic_enable(preempt_schedule);
> +		preempt_dynamic_enable(preempt_schedule_notrace);
> +		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_enable(preempt_lazy);
> +		if (mode != preempt_dynamic_mode)
> +			pr_info("Dynamic Preempt: lazy\n");
> +		break;
>   	}
>   
>   	preempt_dynamic_mode = mode;
> @@ -7494,6 +7563,8 @@ static void __init preempt_dynamic_init(
>   			sched_dynamic_update(preempt_dynamic_none);
>   		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
>   			sched_dynamic_update(preempt_dynamic_voluntary);
> +		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
> +			sched_dynamic_update(preempt_dynamic_lazy);
>   		} else {
>   			/* Default static call setting, nothing to do */
>   			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
> @@ -7514,6 +7585,7 @@ static void __init preempt_dynamic_init(
>   PREEMPT_MODEL_ACCESSOR(none);
>   PREEMPT_MODEL_ACCESSOR(voluntary);
>   PREEMPT_MODEL_ACCESSOR(full);
> +PREEMPT_MODEL_ACCESSOR(lazy);
>   
>   #else /* !CONFIG_PREEMPT_DYNAMIC: */
>   
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struc
>   static int sched_dynamic_show(struct seq_file *m, void *v)
>   {
>   	static const char * preempt_modes[] = {
> -		"none", "voluntary", "full"
> +		"none", "voluntary", "full", "lazy",
>   	};
> +	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
>   	int i;
>   
> -	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
> +	for (i = 0; i < j; i++) {
>   		if (preempt_dynamic_mode == i)
>   			seq_puts(m, "(");
>   		seq_puts(m, preempt_modes[i]);
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
>   		return;
>   
>   	if (resched || did_preempt_short(cfs_rq, curr)) {



If there is a long running task, LAZY would be set only after it is no longer
eligible, and a subsequent tick would upgrade it to NR. If one sets
sysctl_sched_base_slice to a large value (max 4 seconds), LAZY would only be
set after that long (up to 4 seconds) if there is no wakeup on that CPU.

If I set sysctl_sched_base_slice=300ms and spawn 2 stress-ng workers on one
CPU, then the LAZY bit is usually set around 300ms after sched_switch if there
are no wakeups, and NR is set on the subsequent tick. Initially I was thinking
that for a long running process LAZY would be set after one tick and NR on the
subsequent tick. I was wrong. It can take a long time for LAZY to be set, and
NR is then set on the subsequent tick.

Is that the expected behavior, i.e. whoever sets sysctl_sched_base_slice knows
what to expect?
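
To put rough numbers on the scenario above (assuming HZ=1000, i.e. a tick
every 1ms):

	t = 0ms      sched_switch, stress-ng worker starts running
	t ~ 300ms    slice (sysctl_sched_base_slice) used up, curr no longer
	             eligible -> update_curr() sets LAZY
	t ~ 301ms    next tick sees LAZY and promotes it to NR -> reschedule

So with a large base slice the preemption latency is dominated by the slice,
not by the tick.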

> -		resched_curr(rq);
> +		resched_curr_lazy(rq);
>   		clear_buddies(cfs_rq, curr);
>   	}
>   }
> @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>   	 * validating it and just reschedule.
>   	 */
>   	if (queued) {

What's this 'queued' argument used for? hrtick seems to set it. I haven't
understood how it works.

> -		resched_curr(rq_of(cfs_rq));
> +		resched_curr_lazy(rq_of(cfs_rq));
>   		return;
>   	}
>   	/*
> @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
>   	return;
>   
>   preempt:
> -	resched_curr(rq);

Is it better to call resched_curr() here? When the code arrives here, it wants
to run pse as soon as possible, right?

> +	resched_curr_lazy(rq);
>   }
>   
>   static struct task_struct *pick_task_fair(struct rq *rq)
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2692,6 +2692,7 @@ extern void init_sched_rt_class(void);
>   extern void init_sched_fair_class(void);
>   
>   extern void resched_curr(struct rq *rq);
> +extern void resched_curr_lazy(struct rq *rq);
>   extern void resched_cpu(int cpu);
>   
>   extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
> 
> 
> 


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-15 14:37   ` Shrikanth Hegde
@ 2024-10-25 10:42     ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-25 10:42 UTC (permalink / raw)
  To: Shrikanth Hegde
  Cc: Peter Zijlstra, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault, tglx, mingo

On 2024-10-15 20:07:26 [+0530], Shrikanth Hegde wrote:
> 
> 
> On 10/7/24 13:16, Peter Zijlstra wrote:
> > Change fair to use resched_curr_lazy(), which, when the lazy
> > preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> > 
> > This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> > As such, the average delay between setting LAZY and actually
> > rescheduling will be TICK_NSEC/2.
> 
> I didn't understand the math here. How?

If you set the LAZY bit, you wait until sched_tick() fires, and that happens
every TICK_NSEC. In the extreme cases the timer fires either immediately
(right after setting the bit) or after TICK_NSEC (because it just fired, so it
takes another full TICK_NSEC). Given those two extremes, the average is
assumed to be in the middle, i.e. TICK_NSEC/2.
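
Worked out, assuming the LAZY bit gets set at a uniformly random point within
the tick period:

	delay ~ Uniform(0, TICK_NSEC), so E[delay] = TICK_NSEC/2

	HZ=1000: TICK_NSEC = 1ms -> ~0.5ms average
	HZ=250:  TICK_NSEC = 4ms -> ~2ms   average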

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
> >   	__resched_curr(rq, TIF_NEED_RESCHED);
> >   }
> > +#ifdef CONFIG_PREEMPT_DYNAMIC
> > +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> > +static __always_inline bool dynamic_preempt_lazy(void)
> > +{
> > +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> > +}
> > +#else
> > +static __always_inline bool dynamic_preempt_lazy(void)
> > +{
> > +	return IS_ENABLED(PREEMPT_LAZY);
> 
> 
> This should be CONFIG_PREEMPT_LAZY no?

Correct.

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-22 16:44   ` Shrikanth Hegde
@ 2024-10-25 13:19     ` Sebastian Andrzej Siewior
  2024-10-29 18:57       ` Shrikanth Hegde
  0 siblings, 1 reply; 57+ messages in thread
From: Sebastian Andrzej Siewior @ 2024-10-25 13:19 UTC (permalink / raw)
  To: Shrikanth Hegde
  Cc: Peter Zijlstra, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault, tglx, mingo

On 2024-10-22 22:14:41 [+0530], Shrikanth Hegde wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
> >   		return;
> >   	if (resched || did_preempt_short(cfs_rq, curr)) {
> 
> 
> 
> If there is a long running task, LAZY would be set only after it is no longer
> eligible, and a subsequent tick would upgrade it to NR. If one sets
> sysctl_sched_base_slice to a large value (max 4 seconds), LAZY would only be
> set after that long (up to 4 seconds) if there is no wakeup on that CPU.
> 
> If I set sysctl_sched_base_slice=300ms and spawn 2 stress-ng workers on one
> CPU, then the LAZY bit is usually set around 300ms after sched_switch if there
> are no wakeups, and NR is set on the subsequent tick. Initially I was thinking
> that for a long running process LAZY would be set after one tick and NR on the
> subsequent tick. I was wrong. It can take a long time for LAZY to be set, and
> NR is then set on the subsequent tick.
> 
> Is that the expected behavior, i.e. whoever sets sysctl_sched_base_slice knows
> what to expect?

I guess so. Once the slice is up, the NEED_RESCHED bit is replaced with the
LAZY bit. That means a return to userland (from a syscall) or the following
tick will lead to a scheduling event.
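
For reference, those are the two consumers from this series; first the exit
to user path (kernel/entry/common.c), then the tick promotion in sched_tick():

	if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
		schedule();

	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
		resched_curr(rq);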

> > -		resched_curr(rq);
> > +		resched_curr_lazy(rq);
> >   		clear_buddies(cfs_rq, curr);
> >   	}
> >   }
> > @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
> >   	 * validating it and just reschedule.
> >   	 */
> >   	if (queued) {
> 
> What's this 'queued' argument used for? hrtick seems to set it. I haven't
> understood how it works.

from 20241009074631.GH17263@noisy.programming.kicks-ass.net:
| hrtick is disabled by default (because expensive) and so it doesn't
| matter much, but its purpose is to increase accuracy and hence I left
| it untouched for now.

This sets up an hrtimer for the (remaining) time slice and invokes the
task_tick from there (instead of from the regular tick).
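
Roughly this (a simplified sketch of hrtick() in kernel/sched/core.c, from
memory, so details may differ):

	static enum hrtimer_restart hrtick(struct hrtimer *timer)
	{
		struct rq *rq = container_of(timer, struct rq, hrtick_timer);
		struct rq_flags rf;

		rq_lock(rq, &rf);
		update_rq_clock(rq);
		/* queued=1: invoked from the slice hrtimer, not the tick */
		rq->curr->sched_class->task_tick(rq, rq->curr, 1);
		rq_unlock(rq, &rf);

		return HRTIMER_NORESTART;
	}

and hrtick_start() arms hrtick_timer for the remaining slice.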

> > -		resched_curr(rq_of(cfs_rq));
> > +		resched_curr_lazy(rq_of(cfs_rq));
> >   		return;
> >   	}
> >   	/*
> > @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
> >   	return;
> >   preempt:
> > -	resched_curr(rq);
> 
> Is it better to call resched_curr() here? When the code arrives here, it wants
> to run pse as soon as possible, right?

But wouldn't every try_to_wake_up()/wake_up() then result in immediate
preemption? Letting it run and waiting for it to give up on its own, with the
preemption happening on return to userland, usually results in better
performance. At least that is what I observed while playing with this.

> > +	resched_curr_lazy(rq);
> >   }
> >   static struct task_struct *pick_task_fair(struct rq *rq)

Sebastian

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/5] sched: Add Lazy preemption model
  2024-10-25 13:19     ` Sebastian Andrzej Siewior
@ 2024-10-29 18:57       ` Shrikanth Hegde
  0 siblings, 0 replies; 57+ messages in thread
From: Shrikanth Hegde @ 2024-10-29 18:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, linux-kernel, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vschneid,
	ankur.a.arora, efault, tglx, mingo


Hi Sebastian.

On 10/25/24 18:49, Sebastian Andrzej Siewior wrote:
> On 2024-10-22 22:14:41 [+0530], Shrikanth Hegde wrote:
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
>>>    		return;
>>>    	if (resched || did_preempt_short(cfs_rq, curr)) {
>>
>>
>>
>> If there is a long running task, LAZY would be set only after it is no longer
>> eligible, and a subsequent tick would upgrade it to NR. If one sets
>> sysctl_sched_base_slice to a large value (max 4 seconds), LAZY would only be
>> set after that long (up to 4 seconds) if there is no wakeup on that CPU.
>>
>> If I set sysctl_sched_base_slice=300ms and spawn 2 stress-ng workers on one
>> CPU, then the LAZY bit is usually set around 300ms after sched_switch if there
>> are no wakeups, and NR is set on the subsequent tick. Initially I was thinking
>> that for a long running process LAZY would be set after one tick and NR on the
>> subsequent tick. I was wrong. It can take a long time for LAZY to be set, and
>> NR is then set on the subsequent tick.
>>
>> Is that the expected behavior, i.e. whoever sets sysctl_sched_base_slice knows
>> what to expect?
> 
> I guess so. Once the slice is up, the NEED_RESCHED bit is replaced with the
> LAZY bit. That means a return to userland (from a syscall) or the following
> tick will lead to a scheduling event.

ok.

> 
>>> -		resched_curr(rq);
>>> +		resched_curr_lazy(rq);
>>>    		clear_buddies(cfs_rq, curr);
>>>    	}
>>>    }
>>> @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>>>    	 * validating it and just reschedule.
>>>    	 */
>>>    	if (queued) {
>>
>> What's this 'queued' argument used for? hrtick seems to set it. I haven't
>> understood how it works.
> 
> from 20241009074631.GH17263@noisy.programming.kicks-ass.net:
> | hrtick is disabled by default (because expensive) and so it doesn't
> | matter much, but its purpose is to increase accuracy and hence I left
> | it untouched for now.
> 
> This sets up an hrtimer for the (remaining) time slice and invokes the
> task_tick from there (instead of from the regular tick).

Thanks, will take a look and try to understand.

> 
>>> -		resched_curr(rq_of(cfs_rq));
>>> +		resched_curr_lazy(rq_of(cfs_rq));
>>>    		return;
>>>    	}
>>>    	/*
>>> @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
>>>    	return;
>>>    preempt:
>>> -	resched_curr(rq);
>>
>> Is it better to call resched_curr() here? When the code arrives here, it wants
>> to run pse as soon as possible, right?
> 
> But wouldn't every try_to_wake_up()/wake_up() then result in immediate
> preemption? Letting it run and waiting for it to give up on its own, with the
> preemption happening on return to userland, usually results in better
> performance. At least that is what I observed while playing with this.
> 

Yes, I agree that preemption at every ttwu is bad. But that may not happen
with the latest code, i.e. if RUN_TO_PARITY is enabled or pick_eevdf doesn't
pick the waiting task as the best candidate.

My concern was also this code in check_preempt_wakeup_fair:

	/*
	 * Preempt an idle entity in favor of a non-idle entity (and don't
	 * preempt in the inverse case).
	 */
	if (cse_is_idle && !pse_is_idle)
		goto preempt;
	if (cse_is_idle != pse_is_idle)
		return;

If the current task is idle and the waking one is not, we should set NR
instead of LAZY, is what I was thinking. I am not sure whether such a pattern
happens in the exit-to-kernel path, since exit to user is already taken care
of by setting the LAZY bit.
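
Something like this (an untested sketch against this series) is what I had in
mind. Note that the idle upgrade in __resched_curr() only covers the idle
*task* (is_idle_task()), not SCHED_IDLE entities, if I read it right:

	/* in check_preempt_wakeup_fair(); sketch only */
	if (cse_is_idle && !pse_is_idle) {
		resched_curr(rq);	/* force NR, don't wait for the tick */
		return;
	}
	if (cse_is_idle != pse_is_idle)
		return;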


>>> +	resched_curr_lazy(rq);
>>>    }
>>>    static struct task_struct *pick_task_fair(struct rq *rq)
> 
> Sebastian


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [tip: sched/core] sched, x86: Enable Lazy preemption
  2024-10-07  7:46 ` [PATCH 4/5] sched, x86: Enable Lazy preemption Peter Zijlstra
@ 2024-11-06 10:48   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-11-06 10:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Sebastian Andrzej Siewior, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     476e8583ca16eecec0a3a28b6ee7130f4e369389
Gitweb:        https://git.kernel.org/tip/476e8583ca16eecec0a3a28b6ee7130f4e369389
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 04 Oct 2024 14:46:54 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Nov 2024 12:55:38 +01:00

sched, x86: Enable Lazy preemption

Add the TIF bit and select the Kconfig symbol to make it go.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.555778919@infradead.org
---
 arch/x86/Kconfig                   | 1 +
 arch/x86/include/asm/thread_info.h | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2852fcd..b76aa7f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_HW_PTE_YOUNG
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 12da7df..75bb390 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,8 +87,9 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	1	/* callback before returning to user */
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
-#define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
-#define TIF_SSBD		5	/* Speculative store bypass disable */
+#define TIF_NEED_RESCHED_LAZY	4	/* rescheduling necessary */
+#define TIF_SINGLESTEP		5	/* reenable singlestep on user return*/
+#define TIF_SSBD		6	/* Speculative store bypass disable */
 #define TIF_SPEC_IB		9	/* Indirect branch speculation mitigation */
 #define TIF_SPEC_L1D_FLUSH	10	/* Flush L1D on mm switches (processes) */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
@@ -110,6 +111,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SSBD		(1 << TIF_SSBD)
 #define _TIF_SPEC_IB		(1 << TIF_SPEC_IB)

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [tip: sched/core] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
  2024-10-08 13:24   ` Sebastian Andrzej Siewior
  2024-10-10  6:52   ` Christoph Hellwig
@ 2024-11-06 10:48   ` tip-bot2 for Peter Zijlstra
  2 siblings, 0 replies; 57+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-11-06 10:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Sebastian Andrzej Siewior, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     35772d627b55cc7fb4f33bae57c564a25b3121a9
Gitweb:        https://git.kernel.org/tip/35772d627b55cc7fb4f33bae57c564a25b3121a9
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 04 Oct 2024 14:46:56 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Nov 2024 12:55:38 +01:00

sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT

In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT
from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is
not a change in how preemption works, but rather it makes a ton more
code preemptible.

Notably, take away NONE and VOLUNTARY options for PREEMPT_RT, they make
no sense (but are technically possible).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.441622332@infradead.org
---
 kernel/Kconfig.preempt | 12 +++++++-----
 kernel/sched/core.c    |  2 ++
 kernel/sched/debug.c   |  4 ++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 09f06d8..7c1b29a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -20,6 +20,7 @@ choice
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
+	depends on !PREEMPT_RT
 	select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This is the traditional Linux preemption model, geared towards
@@ -35,6 +36,7 @@ config PREEMPT_NONE
 config PREEMPT_VOLUNTARY
 	bool "Voluntary Kernel Preemption (Desktop)"
 	depends on !ARCH_NO_PREEMPT
+	depends on !PREEMPT_RT
 	select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option reduces the latency of the kernel by adding more
@@ -54,7 +56,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	depends on !ARCH_NO_PREEMPT
-	select PREEMPT_BUILD
+	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -74,7 +76,7 @@ config PREEMPT_LAZY
 	bool "Scheduler controlled preemption model"
 	depends on !ARCH_NO_PREEMPT
 	depends on ARCH_HAS_PREEMPT_LAZY
-	select PREEMPT_BUILD
+	select PREEMPT_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option provides a scheduler driven preemption model that
 	  is fundamentally similar to full preemption, but is less
@@ -82,6 +84,8 @@ config PREEMPT_LAZY
 	  reduce lock holder preemption and recover some of the performance
 	  gains seen from using Voluntary preemption.
 
+endchoice
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
@@ -99,8 +103,6 @@ config PREEMPT_RT
 	  Select this if you are building a kernel for systems which
 	  require real-time guarantees.
 
-endchoice
-
 config PREEMPT_COUNT
        bool
 
@@ -110,7 +112,7 @@ config PREEMPTION
 
 config PREEMPT_DYNAMIC
 	bool "Preemption behaviour defined on boot"
-	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+	depends on HAVE_PREEMPT_DYNAMIC
 	select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
 	select PREEMPT_BUILD
 	default y if HAVE_PREEMPT_DYNAMIC_CALL
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index df6a34d..5c47d70 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7450,11 +7450,13 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
 
 int sched_dynamic_mode(const char *str)
 {
+#ifndef CONFIG_PREEMPT_RT
 	if (!strcmp(str, "none"))
 		return preempt_dynamic_none;
 
 	if (!strcmp(str, "voluntary"))
 		return preempt_dynamic_voluntary;
+#endif
 
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 44a49f9..a48b2a7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -248,9 +248,9 @@ static int sched_dynamic_show(struct seq_file *m, void *v)
 		"none", "voluntary", "full", "lazy",
 	};
 	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
-	int i;
+	int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
 
-	for (i = 0; i < j; i++) {
+	for (; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [tip: sched/core] sched: Add TIF_NEED_RESCHED_LAZY infrastructure
  2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
  2024-10-09 12:18   ` Sebastian Andrzej Siewior
@ 2024-11-06 10:48   ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 57+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-11-06 10:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Sebastian Andrzej Siewior, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     26baa1f1c4bdc34b8d698c1900b407d863ad0e69
Gitweb:        https://git.kernel.org/tip/26baa1f1c4bdc34b8d698c1900b407d863ad0e69
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 04 Oct 2024 14:47:02 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Nov 2024 12:55:37 +01:00

sched: Add TIF_NEED_RESCHED_LAZY infrastructure

Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two.
Either bit will cause a resched on return-to-user, but only
TIF_NEED_RESCHED will drive IRQ preemption.

No behavioural change intended.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.219540785@infradead.org
---
 include/linux/entry-common.h |  3 ++-
 include/linux/entry-kvm.h    |  5 +++--
 include/linux/sched.h        |  3 ++-
 include/linux/thread_info.h  | 21 +++++++++++++++++----
 kernel/entry/common.c        |  2 +-
 kernel/entry/kvm.c           |  4 ++--
 kernel/sched/core.c          | 34 +++++++++++++++++++++-------------
 7 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1e50cdb..fc61d02 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -64,7 +64,8 @@
 
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
-	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
+	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 6813171..16149f6 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -17,8 +17,9 @@
 #endif
 
 #define XFER_TO_GUEST_MODE_WORK						\
-	(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL |	\
-	 _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
+	 _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME |			\
+	 ARCH_XFER_TO_GUEST_MODE_WORK)
 
 struct kvm_vcpu;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a76e3d0..1d5cc3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk)
 
 static inline void clear_tsk_need_resched(struct task_struct *tsk)
 {
-	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+	atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+			   (atomic_long_t *)&task_thread_info(tsk)->flags);
 }
 
 static inline int test_tsk_need_resched(struct task_struct *tsk)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28..cf2446c 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {
 
 #include <asm/thread_info.h>
 
+#ifndef TIF_NEED_RESCHED_LAZY
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+#error Inconsistent PREEMPT_LAZY
+#endif
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
@@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
 
 #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return arch_test_bit(TIF_NEED_RESCHED,
+	return arch_test_bit(bit,
 			     (unsigned long *)(&current_thread_info()->flags));
 }
 
 #else
 
-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
 {
-	return test_bit(TIF_NEED_RESCHED,
+	return test_bit(bit,
 			(unsigned long *)(&current_thread_info()->flags));
 }
 
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
 
+static __always_inline bool tif_need_resched(void)
+{
+	return tif_test_bit(TIF_NEED_RESCHED);
+}
+
 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
 static inline int arch_within_stack_frames(const void * const stack,
 					   const void * const stackend,
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 5b6934e..e33691d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_UPROBE)
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 2e0f75b..8485f63 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			return -EINTR;
 		}
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
 			schedule();
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			return ret;
 
 		ti_work = read_thread_flags();
-	} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+	} while (ti_work & XFER_TO_GUEST_MODE_WORK);
 	return 0;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aad4885..0cd05e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq)
  * this avoids any races wrt polling state changes and thereby avoids
  * spurious IPIs.
  */
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	struct thread_info *ti = task_thread_info(p);
-	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+	return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
 }
 
 /*
@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p)
 }
 
 #else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 {
-	set_tsk_need_resched(p);
+	set_ti_thread_flag(ti, tif);
 	return true;
 }
 
@@ -1076,28 +1075,37 @@ void wake_up_q(struct wake_q_head *head)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
 {
 	struct task_struct *curr = rq->curr;
+	struct thread_info *cti = task_thread_info(curr);
 	int cpu;
 
 	lockdep_assert_rq_held(rq);
 
-	if (test_tsk_need_resched(curr))
+	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
 	cpu = cpu_of(rq);
 
 	if (cpu == smp_processor_id()) {
-		set_tsk_need_resched(curr);
-		set_preempt_need_resched();
+		set_ti_thread_flag(cti, tif);
+		if (tif == TIF_NEED_RESCHED)
+			set_preempt_need_resched();
 		return;
 	}
 
-	if (set_nr_and_not_polling(curr))
-		smp_send_reschedule(cpu);
-	else
+	if (set_nr_and_not_polling(cti, tif)) {
+		if (tif == TIF_NEED_RESCHED)
+			smp_send_reschedule(cpu);
+	} else {
 		trace_sched_wake_idle_without_ipi(cpu);
+	}
+}
+
+void resched_curr(struct rq *rq)
+{
+	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
 void resched_cpu(int cpu)
@@ -1192,7 +1200,7 @@ static void wake_up_idle_cpu(int cpu)
 	 * and testing of the above solutions didn't appear to report
 	 * much benefits.
 	 */
-	if (set_nr_and_not_polling(rq->idle))
+	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [tip: sched/core] sched: Add Lazy preemption model
  2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
                     ` (3 preceding siblings ...)
  2024-10-22 16:44   ` Shrikanth Hegde
@ 2024-11-06 10:48   ` tip-bot2 for Peter Zijlstra
  4 siblings, 0 replies; 57+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2024-11-06 10:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Sebastian Andrzej Siewior, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7c70cb94d29cd325fabe4a818c18613e3b9919a1
Gitweb:        https://git.kernel.org/tip/7c70cb94d29cd325fabe4a818c18613e3b9919a1
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 04 Oct 2024 14:46:58 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Nov 2024 12:55:38 +01:00

sched: Add Lazy preemption model

Change fair to use resched_curr_lazy(), which, when the lazy
preemption model is selected, will set TIF_NEED_RESCHED_LAZY.

This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
As such, the average delay between setting LAZY and actually
rescheduling will be TICK_NSEC/2.

In short, Lazy preemption will delay preemption for fair class but
will function as Full preemption for all the other classes, most
notably the realtime (RR/FIFO/DEADLINE) classes.

The goal is to bridge the performance gap with Voluntary, such that we
might eventually remove that option entirely.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20241007075055.331243614@infradead.org
---
 include/linux/preempt.h |  8 +++-
 kernel/Kconfig.preempt  | 15 ++++++++-
 kernel/sched/core.c     | 80 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c    |  5 +--
 kernel/sched/fair.c     |  6 +--
 kernel/sched/sched.h    |  1 +-
 6 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index ce76f1a..ca86235 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
 extern bool preempt_model_none(void);
 extern bool preempt_model_voluntary(void);
 extern bool preempt_model_full(void);
+extern bool preempt_model_lazy(void);
 
 #else
 
@@ -502,6 +503,11 @@ static inline bool preempt_model_full(void)
 	return IS_ENABLED(CONFIG_PREEMPT);
 }
 
+static inline bool preempt_model_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+
 #endif
 
 static inline bool preempt_model_rt(void)
@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void)
  */
 static inline bool preempt_model_preemptible(void)
 {
-	return preempt_model_full() || preempt_model_rt();
+	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
 }
 
 #endif /* __LINUX_PREEMPT_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index fe782cd..09f06d8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config ARCH_HAS_PREEMPT_LAZY
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
@@ -67,6 +70,18 @@ config PREEMPT
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_LAZY
+	bool "Scheduler controlled preemption model"
+	depends on !ARCH_NO_PREEMPT
+	depends on ARCH_HAS_PREEMPT_LAZY
+	select PREEMPT_BUILD
+	help
+	  This option provides a scheduler driven preemption model that
+	  is fundamentally similar to full preemption, but is less
+	  eager to preempt SCHED_NORMAL tasks in an attempt to
+	  reduce lock holder preemption and recover some of the performance
+	  gains seen from using Voluntary preemption.
+
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
 	depends on EXPERT && ARCH_SUPPORTS_RT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0cd05e3..df6a34d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1083,6 +1083,13 @@ static void __resched_curr(struct rq *rq, int tif)
 
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Always immediately preempt the idle task; no point in delaying doing
+	 * actual work.
+	 */
+	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+		tif = TIF_NEED_RESCHED;
+
 	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
 		return;
 
@@ -1108,6 +1115,32 @@ void resched_curr(struct rq *rq)
 	__resched_curr(rq, TIF_NEED_RESCHED);
 }
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+	if (dynamic_preempt_lazy())
+		return TIF_NEED_RESCHED_LAZY;
+
+	return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	__resched_curr(rq, get_lazy_tif_bit());
+}
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5612,6 +5645,10 @@ void sched_tick(void)
 	update_rq_clock(rq);
 	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+		resched_curr(rq);
+
 	donor->sched_class->task_tick(rq, donor, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
@@ -7374,6 +7411,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * VOLUNTARY:
  *   cond_resched               <- __cond_resched
@@ -7381,6 +7419,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- NOP
  *   preempt_schedule_notrace   <- NOP
  *   irqentry_exit_cond_resched <- NOP
+ *   dynamic_preempt_lazy       <- false
  *
  * FULL:
  *   cond_resched               <- RET0
@@ -7388,6 +7427,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
  *   preempt_schedule           <- preempt_schedule
  *   preempt_schedule_notrace   <- preempt_schedule_notrace
  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- false
+ *
+ * LAZY:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ *   dynamic_preempt_lazy       <- true
  */
 
 enum {
@@ -7395,6 +7443,7 @@ enum {
 	preempt_dynamic_none,
 	preempt_dynamic_voluntary,
 	preempt_dynamic_full,
+	preempt_dynamic_lazy,
 };
 
 int preempt_dynamic_mode = preempt_dynamic_undefined;
@@ -7410,15 +7459,23 @@ int sched_dynamic_mode(const char *str)
 	if (!strcmp(str, "full"))
 		return preempt_dynamic_full;
 
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+	if (!strcmp(str, "lazy"))
+		return preempt_dynamic_lazy;
+#endif
+
 	return -EINVAL;
 }
 
+#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+
 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
 #else
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
@@ -7438,6 +7495,7 @@ static void __sched_dynamic_update(int mode)
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
 	preempt_dynamic_enable(irqentry_exit_cond_resched);
+	preempt_dynamic_key_disable(preempt_lazy);
 
 	switch (mode) {
 	case preempt_dynamic_none:
@@ -7447,6 +7505,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: none\n");
 		break;
@@ -7458,6 +7517,7 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: voluntary\n");
 		break;
@@ -7469,9 +7529,22 @@ static void __sched_dynamic_update(int mode)
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_disable(preempt_lazy);
 		if (mode != preempt_dynamic_mode)
 			pr_info("Dynamic Preempt: full\n");
 		break;
+
+	case preempt_dynamic_lazy:
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
+		preempt_dynamic_key_enable(preempt_lazy);
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: lazy\n");
+		break;
 	}
 
 	preempt_dynamic_mode = mode;
@@ -7534,6 +7607,8 @@ static void __init preempt_dynamic_init(void)
 			sched_dynamic_update(preempt_dynamic_none);
 		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+			sched_dynamic_update(preempt_dynamic_lazy);
 		} else {
 			/* Default static call setting, nothing to do */
 			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7554,6 +7629,7 @@ static void __init preempt_dynamic_init(void)
 PREEMPT_MODEL_ACCESSOR(none);
 PREEMPT_MODEL_ACCESSOR(voluntary);
 PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
 
 #else /* !CONFIG_PREEMPT_DYNAMIC: */
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f4035c7..44a49f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
 static int sched_dynamic_show(struct seq_file *m, void *v)
 {
 	static const char * preempt_modes[] = {
-		"none", "voluntary", "full"
+		"none", "voluntary", "full", "lazy",
 	};
+	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+	for (i = 0; i < j; i++) {
 		if (preempt_dynamic_mode == i)
 			seq_puts(m, "(");
 		seq_puts(m, preempt_modes[i]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6512258..3356315 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
 }
@@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 #endif
@@ -8829,7 +8829,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e51bf5a..090dd4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2689,6 +2689,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-10-17 12:36 ` Mike Galbraith
@ 2024-11-07 17:21   ` Thomas Meyer
  2024-11-08  0:59     ` Mike Galbraith
  0 siblings, 1 reply; 57+ messages in thread
From: Thomas Meyer @ 2024-11-07 17:21 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: peterz, bigeasy, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, ankur.a.arora


Mike Galbraith <efault@gmx.de> writes:
> Full per run summaries attached, high speed scroll version below.
>
> desktop util 18.1% static voluntary - virgin source
> desktop util 18.3% static voluntary - +lazy patches
> desktop util 17.5% lazy             - ditto...
> desktop util 17.0% lazy
> desktop util 16.8% lazy
> desktop util 17.8% full
> desktop util 17.8% full
> desktop util 17.8% full

Can you please elaborate a bit more on where those values, e.g. 18.1%, come
from? How do I get those? I couldn't find a connection to your raw data.

Sorry for asking this probably stupid question,

mfg
thomas

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/5] sched: Lazy preemption muck
  2024-11-07 17:21   ` Thomas Meyer
@ 2024-11-08  0:59     ` Mike Galbraith
  0 siblings, 0 replies; 57+ messages in thread
From: Mike Galbraith @ 2024-11-08  0:59 UTC (permalink / raw)
  To: Thomas Meyer
  Cc: peterz, bigeasy, tglx, mingo, linux-kernel, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vschneid, ankur.a.arora

On Thu, 2024-11-07 at 18:21 +0100, Thomas Meyer wrote:
>
> Mike Galbraith <efault@gmx.de> writes:
> > Full per run summaries attached, high speed scroll version below.
> >
> > desktop util 18.1% static voluntary - virgin source
> > desktop util 18.3% static voluntary - +lazy patches
> > desktop util 17.5% lazy             - ditto...
> > desktop util 17.0% lazy
> > desktop util 16.8% lazy
> > desktop util 17.8% full
> > desktop util 17.8% full
> > desktop util 17.8% full
>
> Can you please elaborate a bit more on where those values, e.g. 18.1%, come
> from? How do I get those? I couldn't find a connection to your raw data.

I use a slightly twiddled perf to profile, a la 'perf sched record
<whatever load>', then 'perf sched lat' to emit the profile, then do the
total-runtime-accumulation vs competitor-runtime arithmetic. Both the
competitor and the desktop load are selected for maximally hands-off
consistency: poke start, watch yet another 5 minutes of BigBuckBunny to make
sure the internet doesn't hiccup during recording... and done.

> Sorry for asking this probably stupid question,

Nah, the only silly question is one not pondered first.

	-Mike

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2024-11-08  0:59 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-07  7:46 [PATCH 0/5] sched: Lazy preemption muck Peter Zijlstra
2024-10-07  7:46 ` [PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Peter Zijlstra
2024-10-09 12:18   ` Sebastian Andrzej Siewior
2024-10-09 13:01     ` Peter Zijlstra
2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2024-10-07  7:46 ` [PATCH 2/5] sched: Add Lazy preemption model Peter Zijlstra
2024-10-08  5:43   ` Ankur Arora
2024-10-08 14:48     ` Peter Zijlstra
2024-10-09  8:50   ` Sebastian Andrzej Siewior
2024-10-09  9:14     ` Peter Zijlstra
2024-10-09  9:19       ` Sebastian Andrzej Siewior
2024-10-15 14:37   ` Shrikanth Hegde
2024-10-25 10:42     ` Sebastian Andrzej Siewior
2024-10-22 16:44   ` Shrikanth Hegde
2024-10-25 13:19     ` Sebastian Andrzej Siewior
2024-10-29 18:57       ` Shrikanth Hegde
2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2024-10-07  7:46 ` [PATCH 3/5] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT Peter Zijlstra
2024-10-08 13:24   ` Sebastian Andrzej Siewior
2024-10-08 14:40     ` Peter Zijlstra
2024-10-10  6:52   ` Christoph Hellwig
2024-10-10  7:50     ` Peter Zijlstra
2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2024-10-07  7:46 ` [PATCH 4/5] sched, x86: Enable Lazy preemption Peter Zijlstra
2024-11-06 10:48   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2024-10-07  7:46 ` [PATCH 5/5] sched: Add laziest preempt model Peter Zijlstra
2024-10-08  5:59   ` Ankur Arora
2024-10-08 14:23   ` Thomas Gleixner
2024-10-08 14:40     ` Peter Zijlstra
2024-10-08 15:07   ` Sebastian Andrzej Siewior
2024-10-07  8:33 ` [PATCH 0/5] sched: Lazy preemption muck Sebastian Andrzej Siewior
2024-10-08  4:58 ` Mike Galbraith
2024-10-08 15:32 ` Sebastian Andrzej Siewior
2024-10-09  4:40   ` Ankur Arora
2024-10-09  6:20     ` Sebastian Andrzej Siewior
2024-10-09  7:23       ` Ankur Arora
2024-10-09  8:02       ` Peter Zijlstra
2024-10-09  8:45         ` Sebastian Andrzej Siewior
2024-10-09 14:01         ` Steven Rostedt
2024-10-09 20:13           ` Thomas Gleixner
2024-10-09 20:43             ` Steven Rostedt
2024-10-09 21:06               ` Thomas Gleixner
2024-10-09 21:19                 ` Steven Rostedt
2024-10-09 23:16                   ` Thomas Gleixner
2024-10-09 23:29                     ` Steven Rostedt
2024-10-10  1:20                       ` Thomas Gleixner
2024-10-10 10:23                 ` David Laight
2024-10-13 19:02                   ` Thomas Gleixner
2024-10-14  8:21                     ` David Laight
2024-10-10  3:12               ` Tianchen Ding
2024-10-10  7:47                 ` Thomas Gleixner
2024-10-09  7:30   ` Ankur Arora
2024-10-09  7:46   ` Peter Zijlstra
2024-10-09 11:07 ` Sebastian Andrzej Siewior
2024-10-17 12:36 ` Mike Galbraith
2024-11-07 17:21   ` Thomas Meyer
2024-11-08  0:59     ` Mike Galbraith

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox