Re: [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Juri Lelli <juri.lelli@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>,
	luca abeni <luca.abeni@unitn.it>,
	linux-kernel@vger.kernel.org, mingo@redhat.com,
	vincent.guittot@linaro.org, wanpeng.li@hotmail.com
Subject: Re: [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth
Date: Thu, 24 Mar 2016 10:20:47 +0100	[thread overview]
Message-ID: <20160324092047.GA6375@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <20160225102034.GE6357@twins.programming.kicks-ass.net>

On Thu, Feb 25, 2016 at 11:20:34AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 25, 2016 at 10:07:06AM +0000, Juri Lelli wrote:
> > Argh, this makes a lot of sense to me. I've actually pondered a tree/list
> > solution, but then decided to try the cumulative approach because it
> > looked nicer. But it contains holes, I'm afraid. As Luca already said,
> > GRUB shouldn't have these problems though.
> > 
> > I'll try and see what introducing a list of blocked/throttled deadline
> > tasks means, considering also the interaction with cpusets and such.
> > Maybe it's simpler than it seems.
> > 
> > I'm not sure this will come anytime soon, unfortunately. I'm almost 100%
> > on the sched-freq/schedutil discussion these days.
> 
> Just skip sleep and write them when it's dark outside :-)
> 
> > Anyway, do you also think that what we want to solve the root domain
> > issue is something based on rq_online/offline and per-rq information?
> > Everything else that I tried or thought of was broken/more horrible. :-/
> 
> I was still trying to get my head around this, the above was my
> suggestion to the per-rq state, but I've not thought hard on alternative
> approaches to the root_domain issue.

So the below is the inactive list; it seems to not insta-explode when I
run a few simple dl proglets.

I don't particularly like it because it makes wakeups (esp. cross-cpu
ones) more expensive for the benefit of hotplug/cpusets which is
something that 'never' happens.

So what I'm going to try and do is forget all about this here patch and
see what I can do with a full task-list iteration on rebuild. But I
figured that since I wrote it and it might work, I might as well post
it.

---
 include/linux/sched.h   |   5 ++
 kernel/sched/core.c     |   6 ++-
 kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c     |   2 +-
 kernel/sched/sched.h    |   7 ++-
 5 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea12c6b7..d9848eac35f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,11 @@ struct sched_dl_entity {
 	 * own bandwidth to be enforced, thus we need one timer per task.
 	 */
 	struct hrtimer dl_timer;
+
+#ifdef CONFIG_SMP
+	struct list_head dl_inactive_entry;
+	int		 dl_inactive_cpu;
+#endif
 };
 
 union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a724e1..7f3fab6349a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p);
+			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_event_task_migrate(p);
 	}
@@ -2077,6 +2077,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	RB_CLEAR_NODE(&p->dl.rb_node);
 	init_dl_task_timer(&p->dl);
 	__dl_clear_params(p);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&p->dl.dl_inactive_entry);
+#endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
 	p->rt.timeout		= 0;
@@ -5397,6 +5400,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		migrate_inactive_dl(rq);
 		break;
 
 	case CPU_DEAD:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..f999b8bb6fea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,6 +80,9 @@ void init_dl_rq(struct dl_rq *dl_rq)
 	dl_rq->dl_nr_migratory = 0;
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT;
+
+	raw_spin_lock_init(&dl_rq->dl_inactive_lock);
+	INIT_LIST_HEAD(&dl_rq->dl_inactive_list);
 #else
 	init_dl_bw(&dl_rq->dl_bw);
 #endif
@@ -289,6 +292,62 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 	return later_rq;
 }
 
+static void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	raw_spin_lock(&dl_rq->dl_inactive_lock);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, rq_of_dl_rq(dl_rq)->cpu);
+	list_add(&dl_se->dl_inactive_entry, &dl_rq->dl_inactive_list);
+	raw_spin_unlock(&dl_rq->dl_inactive_lock);
+}
+
+static void dequeue_inactive(struct sched_dl_entity *dl_se)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *rq;
+
+again:
+	if (cpu == -1)
+		return;
+	rq = cpu_rq(cpu);
+
+	raw_spin_lock(&rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_del_init(&dl_se->dl_inactive_entry);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, -1);
+	raw_spin_unlock(&rq->dl.dl_inactive_lock);
+}
+
+static void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *src_rq, *dst_rq;
+
+	dst_rq = cpu_rq(new_cpu);
+again:
+	if (cpu == -1)
+		return;
+	src_rq = cpu_rq(cpu);
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+		raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, new_cpu);
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #else
 
 static inline
@@ -327,6 +386,11 @@ static inline void queue_push_tasks(struct rq *rq)
 static inline void queue_pull_task(struct rq *rq)
 {
 }
+
+static inline void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { }
+static inline void dequeue_inactive(struct sched_dl_entity *dl_se) { }
+static inline void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu) { }
+
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -960,6 +1024,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
 		return;
 
+	if (!(flags & ENQUEUE_RESTORE))
+		dequeue_inactive(&p->dl);
+
 	enqueue_dl_entity(&p->dl, pi_se, flags);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -970,6 +1037,8 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	dequeue_dl_entity(&p->dl);
 	dequeue_pushable_dl_task(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1074,6 +1143,34 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (list_empty(&dl_se->dl_inactive_entry))
+		return;
+
+	migrate_inactive(dl_se, new_cpu);
+}
+
+void migrate_inactive_dl(struct rq *src_rq)
+{
+	int cpu = cpumask_any_and(src_rq->rd->online, cpu_active_mask);
+	struct rq *dst_rq = cpu_rq(cpu);
+	struct sched_dl_entity *dl_se, *tmp;
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+
+	list_for_each_entry_safe(dl_se, tmp, &src_rq->dl.dl_inactive_list, dl_inactive_entry) {
+		WRITE_ONCE(dl_se->dl_inactive_cpu, cpu);
+		list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	}
+
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1211,13 +1308,19 @@ static void task_dead_dl(struct task_struct *p)
 {
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
+	local_irq_disable();
+
 	/*
 	 * Since we are TASK_DEAD we won't slip out of the domain!
 	 */
-	raw_spin_lock_irq(&dl_b->lock);
+	raw_spin_lock(&dl_b->lock);
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
-	raw_spin_unlock_irq(&dl_b->lock);
+	raw_spin_unlock(&dl_b->lock);
+
+	dequeue_inactive(&p->dl);
+
+	local_irq_enable();
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1702,7 +1805,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * this is the right place to try to pull some other one
 	 * from an overloaded cpu, if any.
 	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued(p)) {
+		dequeue_inactive(&p->dl);
+		return;
+	}
+
+	if (rq->dl.dl_nr_running)
 		return;
 
 	queue_pull_task(rq);
@@ -1728,6 +1836,9 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 			resched_curr(rq);
 #endif
 	}
+
+	if (!task_on_rq_queued(p))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 /*
@@ -1779,6 +1890,7 @@ const struct sched_class dl_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_dl,
+	.migrate_task_rq	= migrate_task_rq_dl,
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 303d6392b389..04e856a85c0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,7 +5231,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
  */
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
 	/*
 	 * We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3fa3660..0de1e2894d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct dl_rq {
 	 */
 	struct rb_root pushable_dl_tasks_root;
 	struct rb_node *pushable_dl_tasks_leftmost;
+
+	raw_spinlock_t   dl_inactive_lock;
+	struct list_head dl_inactive_list;
 #else
 	struct dl_bw dl_bw;
 #endif
@@ -776,6 +779,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+extern void migrate_inactive_dl(struct rq *src_rq);
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1205,7 +1210,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p);
+	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

next prev parent reply	other threads:[~2016-03-24  9:21 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-08 12:45 [PATCH 0/2] sched/deadline: fix cpusets bandwidth accounting Juri Lelli
2016-02-08 12:45 ` [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth Juri Lelli
2016-02-10 11:32   ` Juri Lelli
2016-02-10 11:43     ` luca abeni
2016-02-10 11:58       ` Juri Lelli
2016-02-19 13:43         ` luca abeni
2016-02-19 14:20           ` Steven Rostedt
2016-02-19 14:53             ` luca abeni
2016-02-19 14:57               ` Steven Rostedt
2016-02-22 11:03               ` luca abeni
2016-02-22 10:57         ` [PATCH 0/3] cleanup " Luca Abeni
2016-02-22 10:57           ` [PATCH 1/4] Move some calls to __dl_{sub,add}_ac() from core.c to deadline.c Luca Abeni
2016-02-22 10:57           ` [PATCH 2/4] Move the remaining __dl_{sub,add}_ac() calls " Luca Abeni
2016-02-22 10:57           ` [PATCH 3/4] Remove dl_new Luca Abeni
2016-02-23 15:42             ` Peter Zijlstra
2016-02-24 13:53               ` luca abeni
2016-02-25  9:46                 ` Juri Lelli
2016-03-03  9:03                   ` luca abeni
2016-03-03  9:28                     ` Juri Lelli
2016-03-03 14:23                       ` Steven Rostedt
2016-03-03 14:31                         ` luca abeni
2016-03-03 16:12                         ` Juri Lelli
2016-02-10 12:48     ` [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth luca abeni
2016-02-10 13:42       ` Juri Lelli
2016-02-23 15:48         ` Peter Zijlstra
2016-02-23 15:51           ` Juri Lelli
2016-02-10 14:37     ` Steven Rostedt
2016-02-10 16:27       ` Juri Lelli
2016-02-11 12:12         ` Juri Lelli
2016-02-11 12:22           ` luca abeni
2016-02-11 12:27             ` Juri Lelli
2016-02-11 12:40               ` luca abeni
2016-02-11 12:49                 ` Juri Lelli
2016-02-11 13:05                   ` luca abeni
2016-02-11 14:25                     ` Steven Rostedt
2016-02-11 17:10                       ` Juri Lelli
2016-02-12 17:05                         ` Peter Zijlstra
2016-02-12 17:19                           ` Juri Lelli
2016-02-24 19:17                           ` Peter Zijlstra
2016-02-24 21:46                             ` luca abeni
2016-02-25  7:53                               ` Peter Zijlstra
2016-02-25 10:07                             ` Juri Lelli
2016-02-25 10:20                               ` Peter Zijlstra
2016-03-24  9:20                                 ` Peter Zijlstra [this message]
2016-02-11 21:48                       ` Luca Abeni
2016-02-08 12:45 ` [PATCH 2/2] sched/deadline: rq_{online,offline}_dl for root_domain changes Juri Lelli

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c617ea12c6b dfblob:d9848eac35f dfblob:0b21e7a724e
dfblob:7f3fab6349a dfblob:c7a036facbe dfblob:f999b8bb6fe
dfblob:303d6392b38 dfblob:04e856a85c0 dfblob:e6d4a3fa366
dfblob:0de1e2894d2 )
 OR (
bs:"Re: [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160324092047.GA6375@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=juri.lelli@arm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luca.abeni@unitn.it \
    --cc=mingo@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=wanpeng.li@hotmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.