public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: bsegall@google.com
Cc: Roman Gushchin <klamm@yandex-team.ru>,
	linux-kernel@vger.kernel.org, pjt@google.com,
	chris.j.arges@canonical.com, gregkh@linuxfoundation.org,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [PATCH] sched: tg_set_cfs_bandwidth() causes rq->lock deadlock
Date: Tue, 20 May 2014 16:55:54 +0200	[thread overview]
Message-ID: <20140520145554.GT13658@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <20140520140155.GS13658@twins.programming.kicks-ass.net>

[-- Attachment #1: Type: text/plain, Size: 9786 bytes --]

On Tue, May 20, 2014 at 04:01:55PM +0200, Peter Zijlstra wrote:

> Hmm,. doesn't this also mean its entirely unsafe to call
> hrtimer_forward*() from the timer callback, because it might be changing
> the time of an already enqueued timer, which would corrupt the rb-tree
> order.
> 
> Lemme go find a nice way out of this mess, I think I'm responsible for
> creating it in the first place :-(

Ah, so since the hrtimer doesn't provide any serialization of its own,
per design, we should serialize the access to it ourselves, which is
easily done.

Something like so would do I suppose..

Here we make sure to be holding the bandwidth lock (be it cfs_b->lock or
rt_b->rt_runtime_lock) around both starting the timer and in the handler
while modifying its expiration date.

---
 kernel/hrtimer.c     |  9 +++++---
 kernel/sched/core.c  | 12 ++++++----
 kernel/sched/fair.c  | 65 +++++++++++-----------------------------------------
 kernel/sched/rt.c    | 14 +++++------
 kernel/sched/sched.h |  4 ++--
 5 files changed, 34 insertions(+), 70 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3ab28993f6e0..28942c65635e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1273,11 +1273,14 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
+	 *
+	 * Note: Because we dropped the cpu_base->lock above,
+	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
+	 * for us already.
 	 */
-	if (restart != HRTIMER_NORESTART) {
-		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+	if (restart != HRTIMER_NORESTART &&
+	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
-	}
 
 	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ea7b3f1a087..1f4602993d37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -106,17 +106,17 @@ void __smp_mb__after_atomic(void)
 EXPORT_SYMBOL(__smp_mb__after_atomic);
 #endif
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+int start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
-	ktime_t soft, hard, now;
+	ktime_t soft, hard;
+	int overruns = 0;
 
 	for (;;) {
-		if (hrtimer_active(period_timer))
+		if (hrtimer_is_queued(period_timer))
 			break;
 
-		now = hrtimer_cb_get_time(period_timer);
-		hrtimer_forward(period_timer, now, period);
+		overruns += hrtimer_forward_now(period_timer, period);
 
 		soft = hrtimer_get_softexpires(period_timer);
 		hard = hrtimer_get_expires(period_timer);
@@ -124,6 +124,8 @@ void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 		__hrtimer_start_range_ns(period_timer, soft, delta,
 					 HRTIMER_MODE_ABS_PINNED, 0);
 	}
+
+	return overruns;
 }
 
 DEFINE_MUTEX(sched_domains_mutex);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28ccf502c63c..11152b621a31 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3146,15 +3146,13 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 		amount = min_amount;
 	else {
 		/*
-		 * If the bandwidth pool has become inactive, then at least one
+		 * If we had overruns starting the timer, then at least one
 		 * period must have elapsed since the last consumption.
 		 * Refresh the global state and ensure bandwidth timer becomes
 		 * active.
 		 */
-		if (!cfs_b->timer_active) {
+		if (start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period))
 			__refill_cfs_bandwidth_runtime(cfs_b);
-			__start_cfs_bandwidth(cfs_b);
-		}
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -3331,8 +3329,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-	if (!cfs_b->timer_active)
-		__start_cfs_bandwidth(cfs_b);
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3430,12 +3427,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
 	u64 runtime, runtime_expires;
-	int idle = 1, throttled;
+	bool idle, throttled;
+
+	lockdep_assert_held(&cfs_b->lock);
 
-	raw_spin_lock(&cfs_b->lock);
 	/* no need to continue the timer with no bandwidth constraint */
 	if (cfs_b->quota == RUNTIME_INF)
-		goto out_unlock;
+		return 1;
 
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	/* idle depends on !throttled (for the case of a large deficit) */
@@ -3444,21 +3442,14 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 
 	/* if we're going inactive then everything else can be deferred */
 	if (idle)
-		goto out_unlock;
-
-	/*
-	 * if we have relooped after returning idle once, we need to update our
-	 * status as actually running, so that other cpus doing
-	 * __start_cfs_bandwidth will stop trying to cancel us.
-	 */
-	cfs_b->timer_active = 1;
+		return 1;
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
 	if (!throttled) {
 		/* mark as potentially idle for the upcoming period */
 		cfs_b->idle = 1;
-		goto out_unlock;
+		return 0;
 	}
 
 	/* account preceding periods in which throttling occurred */
@@ -3498,12 +3489,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * timer to remain active while there are any throttled entities.)
 	 */
 	cfs_b->idle = 0;
-out_unlock:
-	if (idle)
-		cfs_b->timer_active = 0;
-	raw_spin_unlock(&cfs_b->lock);
-
-	return idle;
+	return 0;
 }
 
 /* a cfs_rq won't donate quota below this amount */
@@ -3676,19 +3662,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
-	ktime_t now;
 	int overrun;
 	int idle = 0;
 
+	raw_spin_lock(&cfs_b->lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
 			break;
 
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 	}
+	raw_spin_unlock(&cfs_b->lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -3713,30 +3698,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-	/*
-	 * The timer may be active because we're trying to set a new bandwidth
-	 * period or because we're racing with the tear-down path
-	 * (timer_active==0 becomes visible before the hrtimer call-back
-	 * terminates).  In either case we ensure that it's re-programmed
-	 */
-	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-		/* bounce the lock to allow do_sched_cfs_period_timer to run */
-		raw_spin_unlock(&cfs_b->lock);
-		cpu_relax();
-		raw_spin_lock(&cfs_b->lock);
-		/* if someone else restarted the timer then we're done */
-		if (cfs_b->timer_active)
-			return;
-	}
-
-	cfs_b->timer_active = 1;
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
-}
-
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	hrtimer_cancel(&cfs_b->period_timer);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7795e292f4c9..60bfb8caa27a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -17,19 +17,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
 	struct rt_bandwidth *rt_b =
 		container_of(timer, struct rt_bandwidth, rt_period_timer);
-	ktime_t now;
-	int overrun;
 	int idle = 0;
+	int overrun;
 
+	raw_spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
 		if (!overrun)
 			break;
 
+		raw_spin_unlock(&rt_b->rt_runtime_lock);
 		idle = do_sched_rt_period_timer(rt_b, overrun);
+		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -51,9 +52,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
-	if (hrtimer_active(&rt_b->rt_period_timer))
-		return;
-
 	raw_spin_lock(&rt_b->rt_runtime_lock);
 	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2cbe81308af..0937e341e079 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -187,7 +187,7 @@ struct cfs_bandwidth {
 	s64 hierarchal_quota;
 	u64 runtime_expires;
 
-	int idle, timer_active;
+	int idle;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -1288,7 +1288,7 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+extern int start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

  reply	other threads:[~2014-05-20 14:56 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-05-15 16:58 [PATCH] sched: tg_set_cfs_bandwidth() causes rq->lock deadlock Roman Gushchin
2014-05-15 17:43 ` bsegall
2014-05-16  8:38   ` Roman Gushchin
2014-05-16 17:57     ` bsegall
2014-05-19 11:10       ` Roman Gushchin
2014-05-19 17:40         ` bsegall
2014-05-19 10:32     ` Peter Zijlstra
2014-05-19 11:37       ` Roman Gushchin
2014-05-19 17:35         ` bsegall
2014-05-19 17:30       ` bsegall
2014-05-20 13:15         ` Peter Zijlstra
2014-05-20 14:01           ` Peter Zijlstra
2014-05-20 14:55             ` Peter Zijlstra [this message]
2014-05-20 14:21           ` Roman Gushchin
2014-05-20 14:28             ` Peter Zijlstra
2014-05-20 14:34               ` Roman Gushchin
2014-05-20 19:08           ` bsegall
2014-05-21  7:58             ` Peter Zijlstra
2014-05-19 10:40   ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140520145554.GT13658@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=bsegall@google.com \
    --cc=chris.j.arges@canonical.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=klamm@yandex-team.ru \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pjt@google.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox