public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Raistlin <raistlin@linux.it>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	michael trimarchi <michael@evidence.eu.com>,
	Fabio Checconi <fabio@gandalf.sssup.it>,
	Ingo Molnar <mingo@elte.hu>, Thomas Gleixner <tglx@linutronix.de>,
	Dhaval Giani <dhaval.giani@gmail.com>,
	Johan Eker <johan.eker@ericsson.com>,
	"p.faure" <p.faure@akatech.ch>,
	Chris Friesen <cfriesen@nortel.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Henrik Austad <henrik@austad.us>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Darren Hart <darren@dvhart.com>,
	Sven-Thorsten Dietrich <sven@thebigcorporation.com>,
	Bjoern Brandenburg <bbb@cs.unc.edu>,
	Tommaso Cucinotta <tommaso.cucinotta@sssup.it>,
	"giuseppe.lipari" <giuseppe.lipari@sssup.it>,
	Juri Lelli <juri.lelli@gmail.com>
Subject: [RFC 9/12][PATCH] SCHED_DEADLINE: system wide bandwidth management
Date: Fri, 16 Oct 2009 17:45:40 +0200	[thread overview]
Message-ID: <1255707940.6228.464.camel@Palantir> (raw)
In-Reply-To: <1255707324.6228.448.camel@Palantir>

[-- Attachment #1: Type: text/plain, Size: 8513 bytes --]

This commit adds the capability of controlling the maximum, system wide,
CPU bandwidth that is devoted to SCHED_DEADLINE tasks.

This is done by means of two files:
 - /proc/sys/kernel/sched_deadline_runtime_us,
 - /proc/sys/kernel/sched_deadline_period_us.
The ratio runtime/period is the total bandwidth all the SCHED_DEADLINE tasks
can use in the system as a whole.
Trying to create tasks in such a way that they would exceed this limit will
fail: task creation is rejected as soon as the bandwidth cap would be exceeded.

The default value is _zero_ available bandwidth, so write some values into those
files before trying to start any SCHED_DEADLINE task. Setting runtime > period
is allowed (i.e., more than 100% bandwidth available for -deadline tasks),
since this makes perfect sense on SMP systems.

Signed-off-by: Raistlin <raistlin@linux.it>
---
 include/linux/sched.h |    7 ++
 kernel/sched.c        |  149 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c       |   16 +++++
 3 files changed, 171 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 478e07c..4de72eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1984,6 +1984,13 @@ int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+extern unsigned int sysctl_sched_deadline_period;
+extern int sysctl_sched_deadline_runtime;
+
+int sched_deadline_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 extern unsigned int sysctl_sched_compat_yield;
 
 #ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c3e834..d8b6354 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -870,6 +870,34 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+/*
+ * deadline_runtime/deadline_period is the maximum bandwidth
+ * -deadline tasks can use. It is system wide, i.e., the sum
+ * of the bandwidths of all the tasks, inside every group and
+ * running on any CPU, has to stay below this value!
+ *
+ * default: 0s (= no bandwidth for -deadline tasks)
+ */
+unsigned int sysctl_sched_deadline_period = 0;	/* microseconds (sched_deadline_period_us) */
+int sysctl_sched_deadline_runtime = 0;	/* microseconds (sched_deadline_runtime_us) */
+
+static inline u64 global_deadline_period(void)	/* period converted to nanoseconds */
+{
+	return (u64)sysctl_sched_deadline_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_deadline_runtime(void)	/* runtime converted to nanoseconds */
+{
+	return (u64)sysctl_sched_deadline_runtime * NSEC_PER_USEC;
+}
+
+/*
+ * Locking for system wide deadline bandwidth: the mutex serializes sysctl writers; the spinlock guards __sysctl_sched_deadline_total_bw.
+ */
+static DEFINE_MUTEX(deadline_constraints_mutex);
+static DEFINE_SPINLOCK(__sysctl_sched_deadline_lock);
+static u64 __sysctl_sched_deadline_total_bw;	/* sum of admitted tasks' bandwidth, in to_ratio() units */
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -2606,6 +2634,66 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	return div64_u64(runtime << 20, period);
 }
 
+static inline
+void __deadline_clear_task_bw(struct task_struct *p, u64 tsk_bw)	/* give tsk_bw back to the global -deadline budget */
+{
+	__sysctl_sched_deadline_total_bw -= tsk_bw;	/* NOTE(review): finish_task_switch() calls this without __sysctl_sched_deadline_lock held -- racy? confirm */
+}
+
+static inline
+void __deadline_add_task_bw(struct task_struct *p, u64 tsk_bw)	/* account tsk_bw into the global -deadline budget */
+{
+	__sysctl_sched_deadline_total_bw += tsk_bw;	/* callers in __deadline_check_task_bw() hold __sysctl_sched_deadline_lock */
+}
+
+/*
+ * update the total allocated bandwidth, if a new -deadline task arrives,
+ * leaves or stays, but modifies its bandwidth.
+ */
+static int __deadline_check_task_bw(struct task_struct *p, int policy,
+				    struct sched_param_ex *param_ex)
+{
+	u64 bw, tsk_bw;
+	int ret = 0;	/* 0 = reject, 1 = admit */
+
+	spin_lock(&__sysctl_sched_deadline_lock);
+
+	if (sysctl_sched_deadline_period == 0)	/* unsigned: "<= 0" was misleading */
+		goto unlock;
+
+	bw = to_ratio(sysctl_sched_deadline_period,
+		      sysctl_sched_deadline_runtime);
+	if (bw == 0)
+		goto unlock;	/* was "return 0;" -- leaked the spinlock */
+
+	if (deadline_policy(policy))
+		tsk_bw = to_ratio(timespec_to_ns(&param_ex->sched_deadline),
+				  timespec_to_ns(&param_ex->sched_runtime));
+
+	/*
+	 * Whether a task enters, leaves, or stays -deadline but changes
+	 * its parameters, we need to update accordingly the global
+	 * deadline allocated bandwidth.
+	 */
+	if (task_has_deadline_policy(p) && !deadline_policy(policy)) {
+		__deadline_clear_task_bw(p, p->dl.bw);
+		ret = 1;
+	} else if (task_has_deadline_policy(p) && deadline_policy(policy) &&
+		  bw >= __sysctl_sched_deadline_total_bw - p->dl.bw + tsk_bw) {
+		__deadline_clear_task_bw(p, p->dl.bw);
+		__deadline_add_task_bw(p, tsk_bw);
+		ret = 1;
+	} else if (deadline_policy(policy) && !task_has_deadline_policy(p) &&
+		   bw >= __sysctl_sched_deadline_total_bw + tsk_bw) {
+		__deadline_add_task_bw(p, tsk_bw);
+		ret = 1;
+	}
+unlock:
+	spin_unlock(&__sysctl_sched_deadline_lock);
+
+	return ret;
+}
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
@@ -2765,8 +2853,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
 		/* a deadline task is dying: stop the bandwidth timer */
-		if (deadline_task(prev))
+		if (deadline_task(prev)) {
+			__deadline_clear_task_bw(prev, prev->dl.bw);
 			hrtimer_cancel(&prev->dl.dl_timer);
+		}
 
 		/*
 		 * Remove function-return probe instances associated with this
@@ -6372,6 +6462,19 @@ recheck:
 		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
+	/*
+	 * If changing to SCHED_DEADLINE (or changing the parameters of a
+	 * SCHED_DEADLINE task) we need to check if enough bandwidth is
+	 * available, which might be not true!
+	 */
+	if (deadline_policy(policy) || deadline_task(p)) {
+		if (!__deadline_check_task_bw(p, policy, param_ex)) {
+			__task_rq_unlock(rq);
+			spin_unlock_irqrestore(&p->pi_lock, flags);
+			return -EPERM;
+		}
+	}
+
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
@@ -10569,6 +10672,25 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+static int sched_deadline_global_constraints(void)	/* 1 = new cap fits allocated bw, 0 = violated */
+{
+	u64 bw;
+	int ret = 1;
+
+	spin_lock_irq(&__sysctl_sched_deadline_lock);	/* NOTE(review): other users take plain spin_lock() -- confirm irq-context needs */
+	if (sysctl_sched_deadline_period == 0)	/* unsigned: "<= 0" was misleading */
+		bw = 0;
+	else
+		bw = to_ratio(global_deadline_period(),
+			      global_deadline_runtime());
+
+	if (bw < __sysctl_sched_deadline_total_bw)
+		ret = 0;
+	spin_unlock_irq(&__sysctl_sched_deadline_lock);
+
+	return ret;
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -10599,6 +10721,31 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+int sched_deadline_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	int old_period, old_runtime;	/* snapshot for rollback if the new cap is rejected */
+
+	mutex_lock(&deadline_constraints_mutex);
+	old_period = sysctl_sched_deadline_period;
+	old_runtime = sysctl_sched_deadline_runtime;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);	/* NOTE(review): lets a negative value into the unsigned period -- confirm */
+
+	if (!ret && write) {
+		if (!sched_deadline_global_constraints()) {
+			sysctl_sched_deadline_period = old_period;	/* restore: already-admitted bw exceeds the new cap */
+			sysctl_sched_deadline_runtime = old_runtime;
+			ret = -EINVAL;
+		}
+	}
+	mutex_unlock(&deadline_constraints_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 /* return corresponding task_group object of a cgroup */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c5..34117f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -373,6 +373,22 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_deadline_period_us",
+		.data		= &sysctl_sched_deadline_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &sched_deadline_handler,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_deadline_runtime_us",
+		.data		= &sysctl_sched_deadline_runtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &sched_deadline_handler,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_compat_yield",
 		.data		= &sysctl_sched_compat_yield,
 		.maxlen		= sizeof(unsigned int),
-- 
1.6.0.4


-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa  (Italy)

http://blog.linux.it/raistlin / raistlin@ekiga.net /
dario.faggioli@jabber.org

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

  parent reply	other threads:[~2009-10-16 15:46 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-10-16 15:35 [RFC 0/12][PATCH] SCHED_DEADLINE (new version of SCHED_EDF) Raistlin
2009-10-16 15:38 ` [RFC 1/12][PATCH] Extended scheduling parameters structure added Raistlin
2009-12-29 12:15   ` Peter Zijlstra
2010-01-13 10:36     ` Raistlin
2009-10-16 15:40 ` [RFC 0/12][PATCH] SCHED_DEADLINE: core of the scheduling class Raistlin
2009-12-29 12:25   ` Peter Zijlstra
2010-01-13 10:40     ` Dario Faggioli
2009-12-29 12:27   ` Peter Zijlstra
2010-01-13 10:42     ` Raistlin
2009-12-29 14:30   ` Peter Zijlstra
2009-12-29 14:37     ` Peter Zijlstra
2009-12-29 14:40       ` Peter Zijlstra
2010-01-13 16:32     ` Dario Faggioli
2010-01-13 16:47       ` Peter Zijlstra
2009-12-29 14:41   ` Peter Zijlstra
2010-01-13 10:46     ` Raistlin
2009-10-16 15:41 ` [RFC 0/12][PATCH] SCHED_DEADLINE: fork and terminate task logic Raistlin
2009-12-29 15:20   ` Peter Zijlstra
2010-01-13 11:11     ` Raistlin
2010-01-13 16:15       ` Peter Zijlstra
2010-01-13 16:28         ` Dario Faggioli
2010-01-13 21:30         ` Fabio Checconi
2009-10-16 15:41 ` [RFC 0/12][PATCH] SCHED_DEADLINE: added sched_*_ex syscalls Raistlin
2009-10-16 15:42 ` [RFC 0/12][PATCH] SCHED_DEADLINE: added sched-debug support Raistlin
2009-10-16 15:43 ` [RFC 6/12][PATCH] SCHED_DEADLINE: added scheduling latency tracer Raistlin
2009-10-16 15:44 ` [RFC 7/12][PATCH] SCHED_DEADLINE: signal delivery when overrunning Raistlin
2009-12-28 14:19   ` Peter Zijlstra
2010-01-13  9:30     ` Raistlin
2009-10-16 15:44 ` [RFC 8/12][PATCH] SCHED_DEADLINE: wait next instance syscall added Raistlin
2009-12-28 14:30   ` Peter Zijlstra
2010-01-13  9:33     ` Raistlin
2009-10-16 15:45 ` Raistlin [this message]
2009-11-06 11:34   ` [RFC 9/12][PATCH] SCHED_DEADLINE: system wide bandwidth management Dhaval Giani
2009-12-28 14:44   ` Peter Zijlstra
2010-01-13  9:41     ` Raistlin
2009-10-16 15:46 ` [RFC 10/12][PATCH] SCHED_DEADLINE: group bandwidth management code Raistlin
2009-12-28 14:51   ` Peter Zijlstra
2010-01-13  9:46     ` Raistlin
2009-10-16 15:47 ` [RFC 11/12][PATCH] SCHED_DEADLINE: documentation Raistlin
2009-10-16 15:48 ` [RFC 12/12][PATCH] SCHED_DEADLINE: modified sched_*_ex API Raistlin
2009-12-28 15:09   ` Peter Zijlstra
2010-01-13 10:27     ` Raistlin
2010-01-13 16:23       ` Peter Zijlstra
2009-12-29 12:15   ` Peter Zijlstra
2010-01-13 10:33     ` Raistlin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1255707940.6228.464.camel@Palantir \
    --to=raistlin@linux.it \
    --cc=bbb@cs.unc.edu \
    --cc=cfriesen@nortel.com \
    --cc=darren@dvhart.com \
    --cc=dhaval.giani@gmail.com \
    --cc=fabio@gandalf.sssup.it \
    --cc=fweisbec@gmail.com \
    --cc=giuseppe.lipari@sssup.it \
    --cc=henrik@austad.us \
    --cc=johan.eker@ericsson.com \
    --cc=juri.lelli@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=michael@evidence.eu.com \
    --cc=mingo@elte.hu \
    --cc=p.faure@akatech.ch \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=sven@thebigcorporation.com \
    --cc=tglx@linutronix.de \
    --cc=tommaso.cucinotta@sssup.it \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox