[PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked
@ 2013-02-11 21:31 Kirill Tkhai
  2013-02-12  7:06 ` Mike Galbraith
  0 siblings, 1 reply; 5+ messages in thread
From: Kirill Tkhai @ 2013-02-11 21:31 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org
  Cc: Steven Rostedt, Ingo Molnar, Peter Zijlstra, linux-rt-users

A situation is possible where rq->rt is throttled or
has no child entities, while the only tasks in
TASK_RUNNING state on the rq are RT tasks that are
ready for execution. In this case pick_next_task
picks the idle task, and idle wastes CPU time.

The patch changes the logic of pre_schedule slightly.
We look at the rq's tasks and, in the case described
above, unthrottle the highest-priority RT task (i.e.
its rt_rq) of the rq, if one is available. Its rt_rq
gets an rt_time of at most 'rt_runtime-1', and the
time spent in the 'manually unthrottled' state is
not accounted. The manually unthrottled task is
preempted as soon as a new task of any class
becomes available.

Signed-off-by: Kirill V Tkhai <tkhai@yandex.ru>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ingo Molnar <mingo@kernel.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: linux-rt-users <linux-rt-users@vger.kernel.org>
---
 kernel/sched/core.c  |   22 +++++---
 kernel/sched/fair.c  |    2 +-
 kernel/sched/rt.c    |  143 ++++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h |    5 +-
 4 files changed, 135 insertions(+), 37 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55a5ae3..a77447d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -910,7 +910,9 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	const struct sched_class *class;
 
-	if (p->sched_class == rq->curr->sched_class) {
+	if (unlikely(rq->rt_man_unthrottle)) {
+		resched_task(rq->curr);
+	} else if (p->sched_class == rq->curr->sched_class) {
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 	} else {
 		for_each_class(class) {
@@ -1860,15 +1862,23 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 }
 
-#ifdef CONFIG_SMP
-
 /* assumes rq->lock is held */
 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
 {
+	struct rt_rq *rt_rq = &rq->rt;
+	unsigned long rt_total = rt_rq->rt_nr_total;
+
+#ifdef CONFIG_SMP
 	if (prev->sched_class->pre_schedule)
 		prev->sched_class->pre_schedule(rq, prev);
+#endif
+	/* The rq has only RT tasks and they are available */
+	if (rt_total == rq->nr_running && rt_total)
+		check_rt_rq_throttled(rq);
 }
 
+#ifdef CONFIG_SMP
+
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -1886,10 +1896,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2933,6 +2939,7 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
+	rq->rt_man_unthrottle = 0;
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
@@ -6972,6 +6979,7 @@ void __init sched_init(void)
 		rq->nohz_flags = 0;
 #endif
 #endif
+		rq->rt_man_unthrottle = 0;
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..7d98302 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2309,7 +2309,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		rq->nr_running += task_delta;
 
 	/* determine whether we need to wake up potentially idle cpu */
-	if (rq->curr == rq->idle && rq->cfs.nr_running)
+	if ((rq->curr == rq->idle || rq->rt_man_unthrottle) && rq->cfs.nr_running)
 		resched_task(rq->curr);
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 839718d..1cbe5dc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,15 +274,8 @@ static void update_rt_migration(struct rt_rq *rt_rq)
 
 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+	struct task_struct *p = rt_task_of(rt_se);
 
-	rt_rq->rt_nr_total++;
 	if (p->nr_cpus_allowed > 1)
 		rt_rq->rt_nr_migratory++;
 
@@ -291,15 +284,8 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+	struct task_struct *p = rt_task_of(rt_se);
 
-	rt_rq->rt_nr_total--;
 	if (p->nr_cpus_allowed > 1)
 		rt_rq->rt_nr_migratory--;
 
@@ -783,6 +769,31 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
 }
 #endif /* CONFIG_SMP */
 
+static void check_new_entity_available(struct rq *rq, struct rt_rq *rt_rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_rt_entity *rt_se = &curr->rt;
+
+	if (rt_rq->rt_nr_running == 0)
+		return;
+
+	for_each_sched_rt_entity(rt_se) {
+		/* rt_rq is an entity of rt stack of curr */
+		if (rt_rq_of_se(rt_se) == rt_rq) {
+			/*
+			 * We don't know which entity of the stack was
+			 * throttled during check_rt_rq_throttled().
+			 */
+			rq->rt_man_unthrottle = 0;
+			if (rt_rq->rt_nr_running == 1)
+				return;
+			break;
+		}
+	}
+
+	resched_task(curr);
+}
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
 	int i, idle = 1, throttled = 0;
@@ -837,6 +848,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		}
 		if (rt_rq->rt_throttled)
 			throttled = 1;
+		else if (rq->rt_man_unthrottle)
+			check_new_entity_available(rq, rt_rq);
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
@@ -939,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
-	if (!rt_bandwidth_enabled())
+	if (!rt_bandwidth_enabled() || rq->rt_man_unthrottle)
 		return;
 
 	for_each_sched_rt_entity(rt_se) {
@@ -1071,8 +1084,14 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running++;
 
+	if (rt_entity_is_task(rt_se)) {
+		struct rt_rq *rt = &rq_of_rt_rq(rt_rq)->rt;
+
+		rt->rt_nr_total++;
+		inc_rt_migration(rt_se, rt);
+	}
+
 	inc_rt_prio(rt_rq, prio);
-	inc_rt_migration(rt_se, rt_rq);
 	inc_rt_group(rt_se, rt_rq);
 }
 
@@ -1083,8 +1102,15 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
 
+	if (rt_entity_is_task(rt_se)) {
+		struct rt_rq *rt = &rq_of_rt_rq(rt_rq)->rt;
+
+		WARN_ON(!rt->rt_nr_total);
+		rt->rt_nr_total--;
+		dec_rt_migration(rt_se, rt);
+	}
+
 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
-	dec_rt_migration(rt_se, rt_rq);
 	dec_rt_group(rt_se, rt_rq);
 }
 
@@ -1419,17 +1445,15 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 		enqueue_pushable_task(rq, p);
 }
 
-#ifdef CONFIG_SMP
-
-/* Only try algorithms three times */
-#define RT_MAX_TRIES 3
-
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
-	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-		return 1;
-	return 0;
+	if (task_running(rq, p))
+		return 0;
+#ifdef CONFIG_SMP
+	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+		return 0;
+#endif
+	return 1;
 }
 
 /* Return the second highest RT task, NULL otherwise */
@@ -1470,6 +1494,11 @@ next_idx:
 	return next;
 }
 
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static int find_lowest_rq(struct task_struct *task)
@@ -1906,6 +1935,64 @@ void init_sched_rt_class(void)
 }
 #endif /* CONFIG_SMP */
 
+static void unthrottle_single_rt_rq(struct rt_rq *rt_rq)
+{
+	u64 runtime;
+
+	raw_spin_lock(&rt_rq->rt_runtime_lock);
+
+	rt_rq->rt_throttled = 0;
+
+	runtime = sched_rt_runtime(rt_rq);
+	WARN_ON(runtime == RUNTIME_INF);
+
+	rt_rq->rt_time = min(rt_rq->rt_time, runtime - 1);
+
+	raw_spin_unlock(&rt_rq->rt_runtime_lock);
+}
+
+static void unthrottle_highest_rt_rq(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr, *p;
+	struct sched_rt_entity *rt_se;
+
+	p = pick_next_highest_task_rt(rq, cpu_of(rq));
+
+	if (!p || (curr->sched_class == &rt_sched_class &&
+		   curr->prio <= p->prio && curr->on_rq))
+		p = curr;
+
+	rt_se = &p->rt;
+
+	dequeue_rt_stack(rt_se);
+
+	for_each_sched_rt_entity(rt_se) {
+		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+		/* Unthrottle parent rt_rq */
+		if (rt_rq->rt_throttled)
+			unthrottle_single_rt_rq(rt_rq);
+
+		/* Enqueue on parent rt_rq */
+		__enqueue_rt_entity(rt_se, true);
+	}
+}
+
+void check_rt_rq_throttled(struct rq *rq)
+{
+	struct rt_rq *rt_rq = &rq->rt;
+
+	/* Do update here to recognize if rt_rq is throttled */
+	update_curr_rt(rq);
+
+	/* The rt_rq is throttled or all of its children are dequeued */
+	if (unlikely(rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)) {
+		unthrottle_highest_rt_rq(rq);
+		rq->rt_man_unthrottle = 1;
+	}
+
+}
+
 /*
  * When switching a task to RT, we may overload the runqueue
  * with RT tasks. In this case we try to push them off to
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc88644..d7a49b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -293,6 +293,7 @@ static inline int rt_bandwidth_enabled(void)
 struct rt_rq {
 	struct rt_prio_array active;
 	unsigned int rt_nr_running;
+	unsigned long rt_nr_total;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	struct {
 		int curr; /* highest queued rt task prio */
@@ -303,7 +304,6 @@ struct rt_rq {
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
-	unsigned long rt_nr_total;
 	int overloaded;
 	struct plist_head pushable_tasks;
 #endif
@@ -375,6 +375,8 @@ struct rq {
 	unsigned long nohz_flags;
 #endif
 	int skip_clock_update;
+	/* rt rq was manually unthrottled */
+	int rt_man_unthrottle;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -782,6 +784,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+extern void check_rt_rq_throttled(struct rq *rq);
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked
  2013-02-11 21:31 [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked Kirill Tkhai
@ 2013-02-12  7:06 ` Mike Galbraith
  2013-02-12  8:12   ` Stanislav Meduna
  2013-02-12 14:46   ` Steven Rostedt
  0 siblings, 2 replies; 5+ messages in thread
From: Mike Galbraith @ 2013-02-12  7:06 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel@vger.kernel.org, Steven Rostedt, Ingo Molnar,
	Peter Zijlstra, linux-rt-users

On Tue, 2013-02-12 at 01:31 +0400, Kirill Tkhai wrote: 
> It's possible a situation when rq->rt is throttled or
> it has no child entities and there are RT tasks ready
> for execution in the rq which are the only tasks
> of TASK_RUNNING state. In this case pick_next_task
> takes idle tasks and idle wastes cpu time.

That's not a waste of CPU time, that's utilization enforcement, the thing
it is designed to do.  I'd rather see borrowing go away, bodyguard being
either a ruthless thug who may save your bacon, or unemployed.  AFAIKT,
the throttle is useless in any other role, just having it standing
around glaring across the room occasionally can turn sensitive realtime
into a quaking heap of jello.

-Mike

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked
  2013-02-12  7:06 ` Mike Galbraith
@ 2013-02-12  8:12   ` Stanislav Meduna
  2013-02-12 12:15     ` Mike Galbraith
  2013-02-12 14:46   ` Steven Rostedt
  1 sibling, 1 reply; 5+ messages in thread
From: Stanislav Meduna @ 2013-02-12  8:12 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org; +Cc: Ingo Molnar, Peter Zijlstra, linux-rt-users

On 12.02.2013 08:06, Mike Galbraith wrote:

>> In this case pick_next_task takes idle tasks and idle wastes cpu
>> time.

> That's not a waste of CPU time, that's utilization enforcement the thing
> it is designed to do.

Well this is a philosophical question and the opinions will IMHO
vary strongly. If the throttling kicks in, the system already
is in the out-of-spec state. Is the goal now just to allow
e.g. the ssh login to be able to kill the task and still try
to do the best if otherwise (possibly masking the problem for
months), or is it to enforce the utilization?

For example we have a PLC software where the end-user develops
an application that will be executed in our realtime task.
The application usually has a longer initialization part where
the excess utilization can happen and should be tolerated
and the running part where it is a bug if it happens. Here
I would prefer the throttling to alert the user, but not
to actually throttle if there is no non-RT task actually
wanting to run. In other cases I would maybe prefer even
killing the task, alerting the user to the fact.

I have a related question: is the information that the throttling
happened available somewhere except the log (where it gets only
written once)? If not, would a patch exporting the count
of throttlings via /sys be accepted?

My problem is that I would like to know that the throttling
happened right now and display it to the user.

Regards
-- 
                                        Stano

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked
  2013-02-12  8:12   ` Stanislav Meduna
@ 2013-02-12 12:15     ` Mike Galbraith
  0 siblings, 0 replies; 5+ messages in thread
From: Mike Galbraith @ 2013-02-12 12:15 UTC (permalink / raw)
  To: Stanislav Meduna
  Cc: linux-kernel@vger.kernel.org, Ingo Molnar, Peter Zijlstra,
	linux-rt-users

On Tue, 2013-02-12 at 09:12 +0100, Stanislav Meduna wrote: 
> On 12.02.2013 08:06, Mike Galbraith wrote:
> 
> >> In this case pick_next_task takes idle tasks and idle wastes cpu
> >> time.
> 
> > That's not a waste of CPU time, that's utilization enforcement the thing
> > it is designed to do.
> 
> Well this is a philosophical question and the opinions will IMHO
> vary strongly. If the throttling kicks in, the system already
> is in the out-of-spec state.

Exactly, please don't feed the wild eyed psychopaths ;-)

> Is the goal now just to allow
> e.g. the ssh login to be able to kill the task and still try
> to do the best if otherwise (possibly masking the problem for
> months), or is it to enforce the utilization?

Both.  It has two modes of enforcement, sane mode is I WILL constrain
this thing you turned loose should it act up, and not so sane mode,
where borrowing a cup of CPU from the neighbors is ok.  Workqueues.

> For example we have a PLC software where the end-user develops
> an application that will be executed in our realtime task.
> The application usually has a longer initialization part where
> the excess utilization can happen and should be tolerated
> and the running part where it is a bug if it happens. Here
> I would prefer the throttling to alert the user, but not
> to actually throttle if there is no non-RT task actually
> wanting to run. In other cases I would maybe prefer even
> killing the task, alerting the user to the fact.

That's not in the throttles job description.  It's not a monitor and
report system, it's a constraint system for very dangerous beasts.

> I have a related question: is the information that the throttling
> happened available somewhere except the log (where it gets only
> written once)? If not, would a patch exporting the count
> of throttlings via /sys be accepted?

I'm not the maintainer, so can't say.  Seems to me a trace point would
be better though.

-Mike


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked
  2013-02-12  7:06 ` Mike Galbraith
  2013-02-12  8:12   ` Stanislav Meduna
@ 2013-02-12 14:46   ` Steven Rostedt
  1 sibling, 0 replies; 5+ messages in thread
From: Steven Rostedt @ 2013-02-12 14:46 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Kirill Tkhai, linux-kernel@vger.kernel.org, Ingo Molnar,
	Peter Zijlstra, linux-rt-users

On Tue, 2013-02-12 at 08:06 +0100, Mike Galbraith wrote:
> On Tue, 2013-02-12 at 01:31 +0400, Kirill Tkhai wrote: 
> > It's possible a situation when rq->rt is throttled or
> > it has no child entities and there are RT tasks ready
> > for execution in the rq which are the only tasks
> > of TASK_RUNNING state. In this case pick_next_task
> > takes idle tasks and idle wastes cpu time.
> 
> That's not a waste of CPU time, that's utilization enforcement the thing
> it is designed to do.  I'd rather see borrowing go away, bodyguard being
> either a ruthless thug who may save your bacon, or unemployed.  AFAIKT,
> the throttle is useless in any other role, just having it standing
> around glaring across the room occasionally can turn sensitive realtime
> into a quaking heap of jello.

I guess what Mike is saying is, you can do a bunch of tests, and if it
happens to throttle, you won't notice because the system happened to be
idle.

Now if you are running in production, and hit the throttle, where
there are tasks to run, your production system just went *THUNK*.

Now in argument for the "borrowing" of idle, I do believe that the
throttle does still occur, and you get a nasty output on the console.
But not everyone reads those.

-- Steve

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2013-02-12 14:46 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-02-11 21:31 [PATCH] sched/rt: Unthrottle the highest RT task of the rq if there are no another available tasks to be picked Kirill Tkhai
2013-02-12  7:06 ` Mike Galbraith
2013-02-12  8:12   ` Stanislav Meduna
2013-02-12 12:15     ` Mike Galbraith
2013-02-12 14:46   ` Steven Rostedt

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.