All of lore.kernel.org
 help / color / mirror / Atom feed
* [GIT PULL] sched: Fix missing preemption opportunity
@ 2015-01-22 17:08 Frederic Weisbecker
  2015-01-23  9:13 ` Peter Zijlstra
  2015-02-01 17:52 ` [tip:sched/core] " tip-bot for Frederic Weisbecker
  0 siblings, 2 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2015-01-22 17:08 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML, Frederic Weisbecker, Peter Zijlstra, Linus Torvalds

Ingo,

Please pull the sched/urgent branch that can be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
	sched/urgent

HEAD: 4ccbe99852951957419bc616f888e297a478e50b

It's part of the preempt/schedule cleanups series suggested by Linus. I'm reworking
these but this one commit is an exception because it's a bugfix. So I'm sending it
now. I believe it's not a regression so it's perhaps too late for -rc5. I let you
judge. The commit is based on -rc5 so it can be merged on sched/core otherwise.

Thanks,
	Frederic
---

Frederic Weisbecker (1):
      sched: Fix missing preemption check in cond_resched()


 kernel/sched/core.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

---
commit 4ccbe99852951957419bc616f888e297a478e50b
Author: Frederic Weisbecker <fweisbec@gmail.com>
Date:   Sun Dec 14 22:04:47 2014 +0100

    sched: Fix missing preemption check in cond_resched()
    
    If an interrupt fires in cond_resched(), between the call to __schedule()
    and the PREEMPT_ACTIVE count decrementation, and that interrupt sets
    TIF_NEED_RESCHED, the call to preempt_schedule_irq() will be ignored
    due to the PREEMPT_ACTIVE count. This kind of scenario, with irq preemption
    being delayed because it's interrupting a preempt-disabled area, is
    usually fixed up after preemption is re-enabled back with an explicit
    call to preempt_schedule().
    
    This is what preempt_enable() does but a raw preempt count decrement as
    performed by __preempt_count_sub(PREEMPT_ACTIVE) doesn't handle delayed
    preemption check. Therefore when such a race happens, the rescheduling
    is going to be delayed until the next scheduler or preemption entrypoint.
    This can be a problem for scheduler latency sensitive workloads.
    
    Lets fix that by consolidating cond_resched() with preempt_schedule()
    internals.
    
    Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
    Reported-by: Ingo Molnar <mingo@kernel.org>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Original-patch-by: Ingo Molnar <mingo@kernel.org>
    Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..c7ed25d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2877,6 +2877,21 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+	do {
+		__preempt_count_add(PREEMPT_ACTIVE);
+		__schedule();
+		__preempt_count_sub(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2907,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	if (likely(!preemptible()))
 		return;
 
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
+	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -4202,17 +4207,10 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-static void __cond_resched(void)
-{
-	__preempt_count_add(PREEMPT_ACTIVE);
-	__schedule();
-	__preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		preempt_schedule_common();
 		return 1;
 	}
 	return 0;
@@ -4237,7 +4235,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			preempt_schedule_common();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4253,7 +4251,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		preempt_schedule_common();
 		local_bh_disable();
 		return 1;
 	}

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [GIT PULL] sched: Fix missing preemption opportunity
  2015-01-22 17:08 [GIT PULL] sched: Fix missing preemption opportunity Frederic Weisbecker
@ 2015-01-23  9:13 ` Peter Zijlstra
  2015-01-23 15:07   ` Frederic Weisbecker
  2015-02-01 17:52 ` [tip:sched/core] " tip-bot for Frederic Weisbecker
  1 sibling, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2015-01-23  9:13 UTC (permalink / raw)
  To: Frederic Weisbecker; +Cc: Ingo Molnar, LKML, Linus Torvalds


I picked up the patch; will drop it if Ingo also does ;-)

On Thu, Jan 22, 2015 at 06:08:04PM +0100, Frederic Weisbecker wrote:
> +++ b/kernel/sched/core.c
> @@ -2877,6 +2877,21 @@ void __sched schedule_preempt_disabled(void)
>  	preempt_disable();
>  }
>  
> +static void preempt_schedule_common(void)
> +{
> +	do {
> +		__preempt_count_add(PREEMPT_ACTIVE);
> +		__schedule();
> +		__preempt_count_sub(PREEMPT_ACTIVE);
> +
> +		/*
> +		 * Check again in case we missed a preemption opportunity
> +		 * between schedule and now.
> +		 */
> +		barrier();

I do however wonder about this barrier() here; why do we think we need
it?

Is that because test_bit() it 'broken'? The bitops are typically atomic
ops and atomic reads should be through a volatile cast (x86
constant_test_bit doesn't seem to do this).

Should we go audit and fix that?

> +	} while (need_resched());
> +}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [GIT PULL] sched: Fix missing preemption opportunity
  2015-01-23  9:13 ` Peter Zijlstra
@ 2015-01-23 15:07   ` Frederic Weisbecker
  0 siblings, 0 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2015-01-23 15:07 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML, Linus Torvalds

On Fri, Jan 23, 2015 at 10:13:53AM +0100, Peter Zijlstra wrote:
> 
> I picked up the patch; will drop it if Ingo also does ;-)
> 
> On Thu, Jan 22, 2015 at 06:08:04PM +0100, Frederic Weisbecker wrote:
> > +++ b/kernel/sched/core.c
> > @@ -2877,6 +2877,21 @@ void __sched schedule_preempt_disabled(void)
> >  	preempt_disable();
> >  }
> >  
> > +static void preempt_schedule_common(void)
> > +{
> > +	do {
> > +		__preempt_count_add(PREEMPT_ACTIVE);
> > +		__schedule();
> > +		__preempt_count_sub(PREEMPT_ACTIVE);
> > +
> > +		/*
> > +		 * Check again in case we missed a preemption opportunity
> > +		 * between schedule and now.
> > +		 */
> > +		barrier();
> 
> I do however wonder about this barrier() here; why do we think we need
> it?
> 
> Is that because test_bit() it 'broken'? The bitops are typically atomic
> ops and atomic reads should be through a volatile cast (x86
> constant_test_bit doesn't seem to do this).
> 
> Should we go audit and fix that?

I looked up with git blame and this was already there prior the first git commit v2.6.12
without appropriate explanation.

We must make sure that the PREEMPT_ACTIVE decrement is visible before we do the NEED_RESCHED
test or an interrupt could spuriously miss a preempt_schedule_irq() opportunity.

__preempt_count_sub() in asm-generic is an inline, so an implicit barrier(). Only x86
overwrites it yet and it does so through an inline as well.

And __preempt_count_ops() must really imply a barrier() anyway, anything else would
be insane (probably we should specify that in a comment somewhere). Although I see
that responsability is taken from non-underscored callers...

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip:sched/core] sched: Fix missing preemption opportunity
  2015-01-22 17:08 [GIT PULL] sched: Fix missing preemption opportunity Frederic Weisbecker
  2015-01-23  9:13 ` Peter Zijlstra
@ 2015-02-01 17:52 ` tip-bot for Frederic Weisbecker
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot for Frederic Weisbecker @ 2015-02-01 17:52 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: fweisbec, hpa, linux-kernel, peterz, mingo, torvalds, tglx

Commit-ID:  a18b5d01819235629289212ad428a5ee2b40f0d9
Gitweb:     http://git.kernel.org/tip/a18b5d01819235629289212ad428a5ee2b40f0d9
Author:     Frederic Weisbecker <fweisbec@gmail.com>
AuthorDate: Thu, 22 Jan 2015 18:08:04 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 30 Jan 2015 19:38:51 +0100

sched: Fix missing preemption opportunity

If an interrupt fires in cond_resched(), between the call to __schedule()
and the PREEMPT_ACTIVE count decrementation, and that interrupt sets
TIF_NEED_RESCHED, the call to preempt_schedule_irq() will be ignored
due to the PREEMPT_ACTIVE count. This kind of scenario, with irq preemption
being delayed because it's interrupting a preempt-disabled area, is
usually fixed up after preemption is re-enabled back with an explicit
call to preempt_schedule().

This is what preempt_enable() does but a raw preempt count decrement as
performed by __preempt_count_sub(PREEMPT_ACTIVE) doesn't handle delayed
preemption check. Therefore when such a race happens, the rescheduling
is going to be delayed until the next scheduler or preemption entrypoint.
This can be a problem for scheduler latency sensitive workloads.

Lets fix that by consolidating cond_resched() with preempt_schedule()
internals.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@kernel.org>
Original-patch-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1421946484-9298-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b591fe..54dce01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2884,6 +2884,21 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+	do {
+		__preempt_count_add(PREEMPT_ACTIVE);
+		__schedule();
+		__preempt_count_sub(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2914,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	if (likely(!preemptible()))
 		return;
 
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
+	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -4209,17 +4214,10 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-static void __cond_resched(void)
-{
-	__preempt_count_add(PREEMPT_ACTIVE);
-	__schedule();
-	__preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		preempt_schedule_common();
 		return 1;
 	}
 	return 0;
@@ -4244,7 +4242,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			preempt_schedule_common();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4260,7 +4258,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		preempt_schedule_common();
 		local_bh_disable();
 		return 1;
 	}

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-02-01 17:53 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-01-22 17:08 [GIT PULL] sched: Fix missing preemption opportunity Frederic Weisbecker
2015-01-23  9:13 ` Peter Zijlstra
2015-01-23 15:07   ` Frederic Weisbecker
2015-02-01 17:52 ` [tip:sched/core] " tip-bot for Frederic Weisbecker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.