From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>,
efault@gmx.de, kernel@kolivas.org, containers@lists.osdl.org,
ckrm-tech@lists.sourceforge.net, torvalds@linux-foundation.org,
akpm@linux-foundation.org, pwil3058@bigpond.net.au,
tingy@cs.umass.edu, tong.n.li@intel.com, wli@holomorphy.com,
linux-kernel@vger.kernel.org, dmitry.adamushko@gmail.com,
balbir@in.ibm.com
Subject: [RFC][PATCH 4/6] Fix (bad?) interactions between SCHED_RT and SCHED_NORMAL tasks
Date: Mon, 11 Jun 2007 21:25:04 +0530 [thread overview]
Message-ID: <20070611155504.GD2109@in.ibm.com> (raw)
In-Reply-To: <20070611154724.GA32435@in.ibm.com>
Currently nr_running and raw_weighted_load fields in runqueue affect
some CFS calculations (like distribute_fair_add, enqueue_sleeper etc).
These fields however are shared between tasks of all classes, which can
potentially affect those calculations for SCHED_NORMAL tasks. However I
do not know of any bad behaviour caused by not splitting these fields (like
this patch does).
This split is nevertheless needed for subsequent patches.
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
kernel/sched.c | 134 +++++++++++++++++++++++++---------------------------
kernel/sched_fair.c | 65 ++++++++++++++++++++++++-
2 files changed, 128 insertions(+), 71 deletions(-)
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c 2007-06-09 15:07:32.000000000 +0530
+++ current/kernel/sched.c 2007-06-09 15:07:36.000000000 +0530
@@ -118,6 +118,7 @@
/* CFS-related fields in a runqueue */
struct lrq {
+ long nr_running;
unsigned long raw_weighted_load;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
@@ -125,6 +126,7 @@
u64 fair_clock, delta_fair_clock;
u64 exec_clock, delta_exec_clock;
+ u64 last_tick; /* when did we last smoothen cpu load? */
s64 wait_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
@@ -148,12 +150,18 @@
* remote CPUs use both these fields when doing load calculation.
*/
long nr_running;
- struct lrq lrq;
+ unsigned long raw_weighted_load;
+#ifdef CONFIG_SMP
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
+#endif
+ struct lrq lrq;
+
u64 nr_switches;
/*
@@ -589,13 +597,13 @@
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load += p->se.load_weight;
+ rq->raw_weighted_load += p->se.load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load -= p->se.load_weight;
+ rq->raw_weighted_load -= p->se.load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -741,7 +749,7 @@
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->lrq.raw_weighted_load;
+ return cpu_rq(cpu)->raw_weighted_load;
}
#ifdef CONFIG_SMP
@@ -876,9 +884,9 @@
struct rq *rq = cpu_rq(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rq->raw_weighted_load;
- return min(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return min(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
@@ -890,9 +898,9 @@
struct rq *rq = cpu_rq(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rq->raw_weighted_load;
- return max(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return max(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
@@ -903,7 +911,7 @@
struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->lrq.raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
/*
@@ -1592,54 +1600,6 @@
return running + uninterruptible;
}
-static void update_load_fair(struct rq *this_rq)
-{
- unsigned long this_load, fair_delta, exec_delta, idle_delta;
- u64 fair_delta64, exec_delta64, tmp64;
- unsigned int i, scale;
-
- this_rq->lrq.nr_load_updates++;
- if (!(sysctl_sched_features & 64)) {
- this_load = this_rq->lrq.raw_weighted_load;
- goto do_avg;
- }
-
- fair_delta64 = this_rq->lrq.delta_fair_clock + 1;
- this_rq->lrq.delta_fair_clock = 0;
-
- exec_delta64 = this_rq->lrq.delta_exec_clock + 1;
- this_rq->lrq.delta_exec_clock = 0;
-
- if (fair_delta64 > (u64)LONG_MAX)
- fair_delta64 = (u64)LONG_MAX;
- fair_delta = (unsigned long)fair_delta64;
-
- if (exec_delta64 > (u64)TICK_NSEC)
- exec_delta64 = (u64)TICK_NSEC;
- exec_delta = (unsigned long)exec_delta64;
-
- idle_delta = TICK_NSEC - exec_delta;
-
- tmp64 = SCHED_LOAD_SCALE * exec_delta64;
- do_div(tmp64, fair_delta);
- tmp64 *= exec_delta64;
- do_div(tmp64, TICK_NSEC);
- this_load = (unsigned long)tmp64;
-
-do_avg:
- /* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->lrq.cpu_load[i];
- new_load = this_load;
-
- this_rq->lrq.cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
- }
-}
-
#ifdef CONFIG_SMP
/*
@@ -2003,7 +1963,7 @@
avg_load += load;
sum_nr_running += rq->nr_running;
- sum_weighted_load += rq->lrq.raw_weighted_load;
+ sum_weighted_load += rq->raw_weighted_load;
}
/*
@@ -2238,11 +2198,11 @@
rq = cpu_rq(i);
if (rq->nr_running == 1 &&
- rq->lrq.raw_weighted_load > imbalance)
+ rq->raw_weighted_load > imbalance)
continue;
- if (rq->lrq.raw_weighted_load > max_load) {
- max_load = rq->lrq.raw_weighted_load;
+ if (rq->raw_weighted_load > max_load) {
+ max_load = rq->raw_weighted_load;
busiest = rq;
}
}
@@ -2576,6 +2536,32 @@
spin_unlock(&target_rq->lock);
}
+static void update_load(struct rq *this_rq)
+{
+ unsigned long this_load;
+ unsigned int i, scale;
+
+ this_load = this_rq->raw_weighted_load;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
+}
+
#ifdef CONFIG_NO_HZ
static struct {
atomic_t load_balancer;
@@ -2822,14 +2808,14 @@
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
}
-#else
+#else /* CONFIG_SMP */
/*
* on UP we do not need to balance between CPUs:
*/
static inline void idle_balance(int cpu, struct rq *rq)
{
}
-#endif
+#endif /* CONFIG_SMP */
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2953,8 +2939,8 @@
if (!idle_at_tick)
task_running_tick(rq, p);
- update_load_fair(rq);
#ifdef CONFIG_SMP
+ update_load(rq);
rq->idle_at_tick = idle_at_tick;
trigger_load_balance(cpu);
#endif
@@ -6090,6 +6076,18 @@
&& addr < (unsigned long)__sched_text_end);
}
+static inline void init_lrq(struct lrq *lrq, struct rq *rq)
+{
+ int j;
+
+ lrq->tasks_timeline = RB_ROOT;
+ lrq->fair_clock = 1;
+ lrq->last_tick = rq_clock(rq);
+ lrq->nr_running = 0;
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ lrq->cpu_load[j] = 0;
+}
+
void __init sched_init(void)
{
int highest_cpu = 0;
@@ -6110,12 +6108,12 @@
spin_lock_init(&rq->lock);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
- rq->lrq.tasks_timeline = RB_ROOT;
- rq->clock = rq->lrq.fair_clock = 1;
+ rq->clock = 1;
+ init_lrq(&rq->lrq, rq);
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->lrq.cpu_load[j] = 0;
#ifdef CONFIG_SMP
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
rq->sd = NULL;
rq->active_balance = 0;
rq->push_cpu = 0;
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c 2007-06-09 15:07:33.000000000 +0530
+++ current/kernel/sched_fair.c 2007-06-09 15:07:36.000000000 +0530
@@ -64,9 +64,7 @@
static long lrq_nr_running(struct lrq *lrq)
{
- struct rq *rq = lrq_rq(lrq);
-
- return rq->nr_running;
+ return lrq->nr_running;
}
#define entity_is_task(se) 1
@@ -119,6 +117,8 @@
rb_link_node(&p->run_node, parent, link);
rb_insert_color(&p->run_node, &lrq->tasks_timeline);
+ lrq->raw_weighted_load += p->load_weight;
+ lrq->nr_running++;
}
static inline void __dequeue_entity(struct lrq *lrq, struct sched_entity *p)
@@ -126,6 +126,8 @@
if (lrq->rb_leftmost == &p->run_node)
lrq->rb_leftmost = NULL;
rb_erase(&p->run_node, &lrq->tasks_timeline);
+ lrq->raw_weighted_load -= p->load_weight;
+ lrq->nr_running--;
}
static inline struct rb_node * first_fair(struct lrq *lrq)
@@ -570,12 +572,69 @@
update_stats_wait_start(lrq, prev, now);
}
+static void update_load_fair(struct lrq *this_lrq)
+{
+ unsigned long this_load, fair_delta, exec_delta, idle_delta;
+ u64 fair_delta64, exec_delta64, tmp64;
+ unsigned int i, scale;
+
+ this_lrq->nr_load_updates++;
+ if (!(sysctl_sched_features & 64)) {
+ this_load = this_lrq->raw_weighted_load;
+ goto do_avg;
+ }
+
+ fair_delta64 = this_lrq->delta_fair_clock + 1;
+ this_lrq->delta_fair_clock = 0;
+
+ exec_delta64 = this_lrq->delta_exec_clock + 1;
+ this_lrq->delta_exec_clock = 0;
+
+ if (fair_delta64 > (u64)LONG_MAX)
+ fair_delta64 = (u64)LONG_MAX;
+ fair_delta = (unsigned long)fair_delta64;
+
+ if (exec_delta64 > (u64)TICK_NSEC)
+ exec_delta64 = (u64)TICK_NSEC;
+ exec_delta = (unsigned long)exec_delta64;
+
+ idle_delta = TICK_NSEC - exec_delta;
+
+ tmp64 = SCHED_LOAD_SCALE * exec_delta64;
+ do_div(tmp64, fair_delta);
+ tmp64 *= exec_delta64;
+ do_div(tmp64, TICK_NSEC);
+ this_load = (unsigned long)tmp64;
+
+do_avg:
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_lrq->cpu_load[i];
+ new_load = this_load;
+
+ this_lrq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
+}
+
static void entity_tick(struct lrq *lrq, struct sched_entity *curr)
{
struct sched_entity *next;
struct rq *rq = lrq_rq(lrq);
u64 now = __rq_clock(rq);
+ /* replay load smoothening for all ticks we lost */
+ while (time_after_eq64(now, lrq->last_tick)) {
+ update_load_fair(lrq);
+ lrq->last_tick += TICK_NSEC;
+ }
+ /* deal with time wraps */
+ if (unlikely(now - lrq->last_tick > TICK_NSEC))
+ lrq->last_tick = now;
+
/*
* Dequeue and enqueue the task to update its
* position within the tree:
--
Regards,
vatsa
next prev parent reply other threads:[~2007-06-11 15:46 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-11 15:47 [RFC][PATCH 0/6] Add group fairness to CFS - v1 Srivatsa Vaddagiri
2007-06-11 15:50 ` [RFC][PATCH 1/6] Introduce struct sched_entity and struct lrq Srivatsa Vaddagiri
2007-06-11 18:48 ` Linus Torvalds
2007-06-11 18:56 ` Ingo Molnar
2007-06-12 2:15 ` [ckrm-tech] " Balbir Singh
2007-06-12 3:52 ` Srivatsa Vaddagiri
2007-06-11 15:52 ` [RFC][PATCH 2/6] task's cpu information needs to be always correct Srivatsa Vaddagiri
2007-06-12 2:17 ` [ckrm-tech] " Balbir Singh
2007-06-11 15:53 ` [RFC][PATCH 3/6] core changes in CFS Srivatsa Vaddagiri
2007-06-12 2:29 ` Balbir Singh
2007-06-12 4:22 ` Srivatsa Vaddagiri
2007-06-11 15:55 ` Srivatsa Vaddagiri [this message]
2007-06-12 9:03 ` [RFC][PATCH 4/6] Fix (bad?) interactions between SCHED_RT and SCHED_NORMAL tasks Dmitry Adamushko
2007-06-12 10:26 ` Srivatsa Vaddagiri
2007-06-12 12:23 ` Dmitry Adamushko
2007-06-12 13:30 ` Srivatsa Vaddagiri
2007-06-12 14:31 ` Dmitry Adamushko
2007-06-12 15:43 ` Srivatsa Vaddagiri
2007-06-11 15:56 ` [RFC][PATCH 5/6] core changes for group fairness Srivatsa Vaddagiri
2007-06-13 20:56 ` Dmitry Adamushko
2007-06-14 12:06 ` Srivatsa Vaddagiri
2007-06-11 15:58 ` [RFC][PATCH 6/6] Hook up to container infrastructure Srivatsa Vaddagiri
2007-06-11 16:02 ` [RFC][PATCH 0/6] Add group fairness to CFS - v1 Srivatsa Vaddagiri
2007-06-11 19:37 ` Ingo Molnar
2007-06-11 19:39 ` Ingo Molnar
2007-06-12 5:50 ` Srivatsa Vaddagiri
2007-06-12 6:26 ` Ingo Molnar
[not found] ` <20070612072742.GA785@in.ibm.com>
2007-06-12 10:56 ` Srivatsa Vaddagiri
2007-06-15 12:46 ` Kirill Korotaev
2007-06-15 14:06 ` Srivatsa Vaddagiri
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070611155504.GD2109@in.ibm.com \
--to=vatsa@linux.vnet.ibm.com \
--cc=akpm@linux-foundation.org \
--cc=balbir@in.ibm.com \
--cc=ckrm-tech@lists.sourceforge.net \
--cc=containers@lists.osdl.org \
--cc=dmitry.adamushko@gmail.com \
--cc=efault@gmx.de \
--cc=kernel@kolivas.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nickpiggin@yahoo.com.au \
--cc=pwil3058@bigpond.net.au \
--cc=tingy@cs.umass.edu \
--cc=tong.n.li@intel.com \
--cc=torvalds@linux-foundation.org \
--cc=wli@holomorphy.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox