[PATCH 8/8] sched: rt-group: smp balancing

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Ingo Molnar <mingo@elte.hu>, linux-kernel@vger.kernel.org
Cc: tong.n.li@intel.com, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 8/8] sched: rt-group: smp balancing
Date: Mon, 04 Feb 2008 22:03:06 +0100	[thread overview]
Message-ID: <20080204211837.889013000@chello.nl> (raw)
In-Reply-To: 20080204210258.118479000@chello.nl

[-- Attachment #1: sched-rt-group-smp.patch --]
[-- Type: text/plain, Size: 7868 bytes --]

Currently the rt group scheduling does a per cpu runtime limit, however
the rt load balancer makes no guarantees about an equal spread of real-
time tasks, just that at any one time, the highest priority tasks run.

Solve this by making the runtime limit a global property by borrowing
excessive runtime from the other cpus once the local limit runs out.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |   38 ++++++++++++++++++++++-
 kernel/sched_rt.c |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 121 insertions(+), 5 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -169,6 +169,7 @@ static inline ktime_t ns_to_ktime(u64 ns
 struct rt_bandwidth {
 	ktime_t rt_period;
 	u64 rt_runtime;
+	spinlock_t rt_runtime_lock;
 	struct hrtimer rt_period_timer;
 };
 
@@ -203,6 +204,8 @@ void init_rt_bandwidth(struct rt_bandwid
 	rt_b->rt_period = ns_to_ktime(period);
 	rt_b->rt_runtime = runtime;
 
+	spin_lock_init(&rt_b->rt_runtime_lock);
+
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
@@ -471,6 +474,8 @@ struct rt_rq {
 #endif
 	int rt_throttled;
 	u64 rt_time;
+	u64 rt_runtime;
+	spinlock_t rt_runtime_lock;
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	unsigned long rt_nr_boosted;
@@ -7216,6 +7221,8 @@ static void init_rt_rq(struct rt_rq *rt_
 
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
+	rt_rq->rt_runtime = 0;
+	spin_lock_init(&rt_rq->rt_runtime_lock);
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	rt_rq->rt_nr_boosted = 0;
@@ -7252,6 +7259,7 @@ static void init_tg_rt_entry(struct rq *
 	init_rt_rq(rt_rq, rq);
 	rt_rq->tg = tg;
 	rt_rq->rt_se = rt_se;
+	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 	if (add)
 		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
@@ -7307,6 +7315,8 @@ void __init sched_init(void)
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
 				&per_cpu(init_sched_rt_entity, i), i, 1);
+#else
+		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8007,15 +8017,26 @@ static int __rt_schedulable(struct task_
 static int tg_set_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
-	int err = 0;
+	int i, err = 0;
 
 	mutex_lock(&rt_constraints_mutex);
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
+
+	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
 	tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = tg->rt_rq[i];
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = rt_runtime;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
  unlock:
 	mutex_unlock(&rt_constraints_mutex);
 
@@ -8079,6 +8100,19 @@ static int sched_rt_global_constraints(v
 #else
 static int sched_rt_global_constraints(void)
 {
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = global_rt_runtime();
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
 	return 0;
 }
 #endif
@@ -8195,7 +8229,7 @@ static u64 cpu_shares_read_uint(struct c
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
 				size_t nbytes, loff_t *unused_ppos)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struc
 	if (!rt_rq->tg)
 		return RUNTIME_INF;
 
-	return rt_rq->tg->rt_bandwidth.rt_runtime;
+	return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -145,11 +150,21 @@ struct rt_rq *sched_rt_period_rt_rq(stru
 	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 }
 
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+	return &rt_rq->tg->rt_bandwidth;
+}
+
 #else
 
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
-	return def_rt_bandwidth.rt_runtime;
+	return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(def_rt_bandwidth.rt_period);
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -200,6 +215,11 @@ struct rt_rq *sched_rt_period_rt_rq(stru
 	return &cpu_rq(cpu)->rt;
 }
 
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+	return &def_rt_bandwidth;
+}
+
 #endif
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
@@ -218,8 +238,10 @@ static int do_sched_rt_period_timer(stru
 
 		spin_lock(&rq->lock);
 		if (rt_rq->rt_time) {
-			u64 runtime = rt_b->rt_runtime;
+			u64 runtime;
 
+			spin_lock(&rt_rq->rt_runtime_lock);
+			runtime = rt_rq->rt_runtime;
 			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 				rt_rq->rt_throttled = 0;
@@ -227,6 +249,7 @@ static int do_sched_rt_period_timer(stru
 			}
 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
 				idle = 0;
+			spin_unlock(&rt_rq->rt_runtime_lock);
 		}
 
 		if (enqueue)
@@ -237,6 +260,47 @@ static int do_sched_rt_period_timer(stru
 	return idle;
 }
 
+#ifdef CONFIG_SMP
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	int i, weight, more = 0;
+	u64 rt_period;
+
+	weight = cpus_weight(rd->span);
+
+	spin_lock(&rt_b->rt_runtime_lock);
+	rt_period = ktime_to_ns(rt_b->rt_period);
+	for_each_cpu_mask(i, rd->span) {
+		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+		s64 diff;
+
+		if (iter == rt_rq)
+			continue;
+
+		spin_lock(&iter->rt_runtime_lock);
+		diff = iter->rt_runtime - iter->rt_time;
+		if (diff > 0) {
+			diff /= weight;
+			if (rt_rq->rt_runtime + diff > rt_period)
+				diff = rt_period - rt_rq->rt_runtime;
+			iter->rt_runtime -= diff;
+			rt_rq->rt_runtime += diff;
+			more = 1;
+			if (rt_rq->rt_runtime == rt_period) {
+				spin_unlock(&iter->rt_runtime_lock);
+				break;
+			}
+		}
+		spin_unlock(&iter->rt_runtime_lock);
+	}
+	spin_unlock(&rt_b->rt_runtime_lock);
+
+	return more;
+}
+#endif
+
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -259,6 +323,22 @@ static int sched_rt_runtime_exceeded(str
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
+	if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
+		return 0;
+
+#ifdef CONFIG_SMP
+	if (rt_rq->rt_time > runtime) {
+		int more;
+
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		more = balance_runtime(rt_rq);
+		spin_lock(&rt_rq->rt_runtime_lock);
+
+		if (more)
+			runtime = sched_rt_runtime(rt_rq);
+	}
+#endif
+
 	if (rt_rq->rt_time > runtime) {
 		rt_rq->rt_throttled = 1;
 		if (rt_rq_throttled(rt_rq)) {
@@ -294,9 +374,11 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	spin_lock(&rt_rq->rt_runtime_lock);
 	rt_rq->rt_time += delta_exec;
 	if (sched_rt_runtime_exceeded(rt_rq))
 		resched_task(curr);
+	spin_unlock(&rt_rq->rt_runtime_lock);
 }
 
 static inline

--

     prev parent reply	other threads:[~2008-02-04 21:17 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20080204210258.118479000@chello.nl>
2008-02-04 21:02 ` [PATCH 1/8] sched: fix incorrect irq lock usage in normalize_rt_tasks() Peter Zijlstra
2008-02-04 21:03 ` [PATCH 2/8] sched: rt-group: deal with PI Peter Zijlstra
2008-02-04 21:03 ` [PATCH 3/8] sched: rt-group: interface Peter Zijlstra
2008-02-06  1:31   ` Randy Dunlap
2008-02-23 19:48   ` Paul Menage
2008-02-23 19:57     ` Peter Zijlstra
2008-02-23 20:02       ` Paul Menage
2008-02-23 20:26         ` Peter Zijlstra
2008-02-23 20:36           ` Paul Menage
2008-02-04 21:03 ` [PATCH 4/8] sched: rt-group: make rt groups scheduling configurable Peter Zijlstra
2008-02-04 21:03 ` [PATCH 5/8] sched: rt-group: clean up the ifdeffery Peter Zijlstra
2008-02-04 21:03 ` [PATCH 6/8] sched: rt-group: refure unrunnable tasks Peter Zijlstra
2008-02-04 21:03 ` [PATCH 7/8] sched: rt-group: synchonised bandwidth period Peter Zijlstra
2008-02-04 21:03 ` Peter Zijlstra [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080204211837.889013000@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tong.n.li@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.