All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Andy Lutomirski <luto@amacapital.net>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Mike Galbraith <bitbucket@online.de>, X86 ML <x86@kernel.org>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC] sched: Add a new lockless wake-from-idle implementation
Date: Thu, 13 Feb 2014 15:50:16 +0100	[thread overview]
Message-ID: <20140213145016.GA15586@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <218b548b69479baa005af8a7b04a3abbea8ed6fa.1392252790.git.luto@amacapital.net>

On Wed, Feb 12, 2014 at 05:40:12PM -0800, Andy Lutomirski wrote:
> This is a strawman proposal to simplify the idle implementation, eliminate
> a race
> 
> Benefits over current code:
>  - ttwu_queue_remote doesn't use an IPI unless needed
>  - The diffstat should speak for itself :)
>  - Less racy.  Spurious IPIs are possible, but only in narrow windows or
>    when two wakeups occur in rapid succession.
>  - Seems to work (?)
> 
> Issues:
>  - Am I doing the percpu stuff right?
>  - Needs work on non-x86 architectures
>  - The !CONFIG_SMP case needs to be checked
>  - Is "idlepoll" a good name for the new code?  It doesn't have *that*
>    much to do with the idle state.  Maybe cpukick?
> 
> If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.

No, we can't do away with that; its used in some fairly critical paths
(return to userspace) and adding a second cacheline load there would be
unfortunate.

I also don't really like how the polling state is an atomic; its a cpu
local property.

Now given we can't get rid of TIF_NEED_RESCHED, and we need an atomic op
on a remote cacheline anyhow; the simplest solution would be to convert
all TS_POLLING users to TIF_POLLING_NRFLAG and use an atomic_or_return()
like construct to do:

  atomic_or_return(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG

and avoid the IPI if that is false.

Something a little like this; it does require a lot of auditing; but it
boots on my x86_64.

---
 arch/x86/include/asm/mwait.h       | 19 ++++----
 arch/x86/include/asm/thread_info.h |  4 +-
 arch/x86/kernel/process.c          |  8 ++--
 include/linux/sched.h              | 90 +++-----------------------------------
 kernel/sched/core.c                | 70 ++++++++++++++++++-----------
 kernel/sched/idle.c                | 40 ++++++++---------
 kernel/sched/sched.h               |  3 ++
 7 files changed, 88 insertions(+), 146 deletions(-)

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f9..d6a7b7cdc478 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -42,18 +42,15 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
  */
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-	if (!current_set_polling_and_test()) {
-		if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
-			mb();
-			clflush((void *)&current_thread_info()->flags);
-			mb();
-		}
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		if (!need_resched())
-			__mwait(eax, ecx);
+	if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+		mb();
+		clflush((void *)&current_thread_info()->flags);
+		mb();
 	}
-	current_clr_polling();
+
+	__monitor((void *)&current_thread_info()->flags, 0, 0);
+	if (!need_resched())
+		__mwait(eax, ecx);
 }
 
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c06ed02..1f2fe575cfca 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -88,6 +88,7 @@ struct thread_info {
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG	21
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
@@ -111,6 +112,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -234,8 +236,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* idle task polling need_resched,
-					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..298c002629c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -306,9 +306,11 @@ void arch_cpu_idle(void)
  */
 void default_idle(void)
 {
-	trace_cpu_idle_rcuidle(1, smp_processor_id());
-	safe_halt();
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	if (!current_clr_polling_and_test()) {
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
+		safe_halt();
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	}
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(default_idle);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c49a2585ff7d..b326abe95978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2692,85 +2692,26 @@ static inline int spin_needbreak(spinlock_t *lock)
  * thread_info.status and one based on TIF_POLLING_NRFLAG in
  * thread_info.flags
  */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-	current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef CONFIG_SMP
 static inline int tsk_is_polling(struct task_struct *p)
 {
 	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
 }
 
-static inline void __current_set_polling(void)
+static inline void current_set_polling(void)
 {
 	set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 *
-	 * XXX: assumes set/clear bit are identical barrier wise.
-	 */
-	smp_mb__after_clear_bit();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
+static inline void current_clr_polling(void)
 {
 	clear_thread_flag(TIF_POLLING_NRFLAG);
 }
 
 static inline bool __must_check current_clr_polling_and_test(void)
 {
-	__current_clr_polling();
+	current_clr_polling();
 
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
 	smp_mb__after_clear_bit();
 
 	return unlikely(tif_need_resched());
@@ -2778,34 +2719,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
 
 #else
 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
+static inline void current_set_polling(void) { }
+static inline void current_clr_polling(void) { }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
 static inline bool __must_check current_clr_polling_and_test(void)
 {
 	return unlikely(tif_need_resched());
 }
 #endif
 
-static inline void current_clr_polling(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-	 * Once the bit is cleared, we'll get IPIs with every new
-	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-	 * fold.
-	 */
-	smp_mb(); /* paired with resched_task() */
-
-	preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb9764fbc537..19de9a1cc96c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -504,45 +504,63 @@ static inline void init_hrtick(void)
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
+static void __resched_cpu(int cpu)
 {
-	int cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *p;
+	struct thread_info *ti;
+	unsigned long val, old, new;
+	bool ipi;
 
-	lockdep_assert_held(&task_rq(p)->lock);
+	rcu_read_lock();
+	do {
+		p = ACCESS_ONCE(rq->curr);
+		ti = task_thread_info(p);
+
+		val = ti->flags;
+		for (;;) {
+			new = val | _TIF_NEED_RESCHED;
+			old = cmpxchg(&ti->flags, val, new);
+			if (old == val)
+				break;
+			val = old;
+		}
 
-	if (test_tsk_need_resched(p))
-		return;
+		ipi = !(old & _TIF_POLLING_NRFLAG);
 
-	set_tsk_need_resched(p);
+	} while (p != ACCESS_ONCE(rq->curr));
+	rcu_read_unlock();
 
-	cpu = task_cpu(p);
+	if (ipi)
+		smp_send_reschedule(cpu);
+}
+
+void resched_cpu(int cpu)
+{
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(current);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
+	__resched_cpu(cpu);
 }
 
-void resched_cpu(int cpu)
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	lockdep_assert_held(&task_rq(p)->lock);
 
-	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+	if (test_tsk_need_resched(p))
 		return;
-	resched_task(cpu_curr(cpu));
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	resched_cpu(task_cpu(p));
 }
 
 #ifdef CONFIG_SMP
@@ -1476,7 +1494,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *llist = llist_del_all(&rq->wake_list);
@@ -2689,7 +2707,7 @@ static void __sched __schedule(void)
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
-	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched(next);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 14ca43430aee..03748737473e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -11,6 +11,7 @@
 #include <asm/tlb.h>
 
 #include <trace/events/power.h>
+#include "sched.h"
 
 static int __read_mostly cpu_idle_force_poll;
 
@@ -71,6 +72,8 @@ static void cpu_idle_loop(void)
 	while (1) {
 		tick_nohz_idle_enter();
 
+		current_set_polling();
+
 		while (!need_resched()) {
 			check_pgt_cache();
 			rmb();
@@ -93,29 +96,27 @@ static void cpu_idle_loop(void)
 			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
 				cpu_idle_poll();
 			} else {
-				if (!current_clr_polling_and_test()) {
-					stop_critical_timings();
-					rcu_idle_enter();
-					if (cpuidle_idle_call())
-						arch_cpu_idle();
-					if (WARN_ON_ONCE(irqs_disabled()))
-						local_irq_enable();
-					rcu_idle_exit();
-					start_critical_timings();
-				} else {
+				stop_critical_timings();
+				rcu_idle_enter();
+				if (cpuidle_idle_call())
+					arch_cpu_idle();
+				if (WARN_ON_ONCE(irqs_disabled()))
 					local_irq_enable();
-				}
-				__current_set_polling();
+				rcu_idle_exit();
+				start_critical_timings();
 			}
 			arch_cpu_idle_exit();
-			/*
-			 * We need to test and propagate the TIF_NEED_RESCHED
-			 * bit here because we might not have send the
-			 * reschedule IPI to idle tasks.
-			 */
-			if (tif_need_resched())
-				set_preempt_need_resched();
 		}
+
+		current_clr_polling();
+
+		/*
+		 * We need to test and propagate the TIF_NEED_RESCHED bit here
+		 * because we might not have send the reschedule IPI to idle
+		 * tasks.
+		 */
+		set_preempt_need_resched();
+		sched_ttwu_pending();
 		tick_nohz_idle_exit();
 		schedule_preempt_disabled();
 	}
@@ -138,7 +139,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1bf34c257d3b..269b1a4e0bdf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1160,6 +1160,7 @@ extern const struct sched_class idle_sched_class;
 
 #ifdef CONFIG_SMP
 
+extern void sched_ttwu_pending(void);
 extern void update_group_power(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
@@ -1170,6 +1171,8 @@ extern void idle_exit_fair(struct rq *this_rq);
 
 #else	/* CONFIG_SMP */
 
+static inline void sched_ttwu_pending(void) { }
+
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }




  parent reply	other threads:[~2014-02-13 14:50 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-02-11 20:23 Too many rescheduling interrupts (still!) Andy Lutomirski
2014-02-11 21:21 ` Thomas Gleixner
2014-02-11 22:34   ` Andy Lutomirski
2014-02-12 10:13     ` Peter Zijlstra
2014-02-12 15:49       ` Andy Lutomirski
2014-02-12 16:39         ` Peter Zijlstra
2014-02-12 18:19           ` Andy Lutomirski
2014-02-12 20:17             ` Peter Zijlstra
2014-02-13  1:40               ` [RFC] sched: Add a new lockless wake-from-idle implementation Andy Lutomirski
2014-02-13  9:38                 ` Ingo Molnar
2014-02-13 14:49                 ` Frederic Weisbecker
2014-02-13 14:50                 ` Peter Zijlstra [this message]
2014-02-13 17:07                   ` Andy Lutomirski
2014-02-13 20:16                     ` Peter Zijlstra
2014-02-13 20:35                       ` Andy Lutomirski
2014-02-13 19:58                   ` Andy Lutomirski
2014-02-14  1:38                     ` Andy Lutomirski
2014-02-14 20:01                   ` Andy Lutomirski
2014-02-14 20:17                     ` Andy Lutomirski
2014-02-14 21:19                       ` Peter Zijlstra
2014-02-12 15:59       ` Too many rescheduling interrupts (still!) Frederic Weisbecker
2014-02-12 16:43         ` Peter Zijlstra
2014-02-12 17:46           ` Frederic Weisbecker
2014-02-12 18:15             ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140213145016.GA15586@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=bitbucket@online.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.