From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>,
efault@gmx.de, kernel@kolivas.org, containers@lists.osdl.org,
ckrm-tech@lists.sourceforge.net, torvalds@linux-foundation.org,
akpm@linux-foundation.org, pwil3058@bigpond.net.au,
tingy@cs.umass.edu, tong.n.li@intel.com, wli@holomorphy.com,
linux-kernel@vger.kernel.org, dmitry.adamushko@gmail.com,
balbir@in.ibm.com
Subject: [RFC][PATCH 4/6] Fix (bad?) interactions between SCHED_RT and SCHED_NORMAL tasks
Date: Mon, 11 Jun 2007 21:25:04 +0530 [thread overview]
Message-ID: <20070611155504.GD2109@in.ibm.com> (raw)
In-Reply-To: <20070611154724.GA32435@in.ibm.com>
Currently the nr_running and raw_weighted_load fields in the runqueue affect
some CFS calculations (like distribute_fair_add, enqueue_sleeper etc).
These fields, however, are shared between tasks of all classes, which can
potentially affect those calculations for SCHED_NORMAL tasks. That said, I
do not know of any bad behaviour caused by leaving these fields unsplit
(this patch splits them).
This split is nevertheless needed for subsequent patches.
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
kernel/sched.c | 134 +++++++++++++++++++++++++---------------------------
kernel/sched_fair.c | 65 ++++++++++++++++++++++++-
2 files changed, 128 insertions(+), 71 deletions(-)
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c 2007-06-09 15:07:32.000000000 +0530
+++ current/kernel/sched.c 2007-06-09 15:07:36.000000000 +0530
@@ -118,6 +118,7 @@
/* CFS-related fields in a runqueue */
struct lrq {
+ long nr_running;
unsigned long raw_weighted_load;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
@@ -125,6 +126,7 @@
u64 fair_clock, delta_fair_clock;
u64 exec_clock, delta_exec_clock;
+ u64 last_tick; /* when did we last smoothen cpu load? */
s64 wait_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
@@ -148,12 +150,18 @@
* remote CPUs use both these fields when doing load calculation.
*/
long nr_running;
- struct lrq lrq;
+ unsigned long raw_weighted_load;
+#ifdef CONFIG_SMP
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
+#endif
+ struct lrq lrq;
+
u64 nr_switches;
/*
@@ -589,13 +597,13 @@
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load += p->se.load_weight;
+ rq->raw_weighted_load += p->se.load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- rq->lrq.raw_weighted_load -= p->se.load_weight;
+ rq->raw_weighted_load -= p->se.load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -741,7 +749,7 @@
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->lrq.raw_weighted_load;
+ return cpu_rq(cpu)->raw_weighted_load;
}
#ifdef CONFIG_SMP
@@ -876,9 +884,9 @@
struct rq *rq = cpu_rq(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rq->raw_weighted_load;
- return min(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return min(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
@@ -890,9 +898,9 @@
struct rq *rq = cpu_rq(cpu);
if (type == 0)
- return rq->lrq.raw_weighted_load;
+ return rq->raw_weighted_load;
- return max(rq->lrq.cpu_load[type-1], rq->lrq.raw_weighted_load);
+ return max(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
@@ -903,7 +911,7 @@
struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->lrq.raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
/*
@@ -1592,54 +1600,6 @@
return running + uninterruptible;
}
-static void update_load_fair(struct rq *this_rq)
-{
- unsigned long this_load, fair_delta, exec_delta, idle_delta;
- u64 fair_delta64, exec_delta64, tmp64;
- unsigned int i, scale;
-
- this_rq->lrq.nr_load_updates++;
- if (!(sysctl_sched_features & 64)) {
- this_load = this_rq->lrq.raw_weighted_load;
- goto do_avg;
- }
-
- fair_delta64 = this_rq->lrq.delta_fair_clock + 1;
- this_rq->lrq.delta_fair_clock = 0;
-
- exec_delta64 = this_rq->lrq.delta_exec_clock + 1;
- this_rq->lrq.delta_exec_clock = 0;
-
- if (fair_delta64 > (u64)LONG_MAX)
- fair_delta64 = (u64)LONG_MAX;
- fair_delta = (unsigned long)fair_delta64;
-
- if (exec_delta64 > (u64)TICK_NSEC)
- exec_delta64 = (u64)TICK_NSEC;
- exec_delta = (unsigned long)exec_delta64;
-
- idle_delta = TICK_NSEC - exec_delta;
-
- tmp64 = SCHED_LOAD_SCALE * exec_delta64;
- do_div(tmp64, fair_delta);
- tmp64 *= exec_delta64;
- do_div(tmp64, TICK_NSEC);
- this_load = (unsigned long)tmp64;
-
-do_avg:
- /* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->lrq.cpu_load[i];
- new_load = this_load;
-
- this_rq->lrq.cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
- }
-}
-
#ifdef CONFIG_SMP
/*
@@ -2003,7 +1963,7 @@
avg_load += load;
sum_nr_running += rq->nr_running;
- sum_weighted_load += rq->lrq.raw_weighted_load;
+ sum_weighted_load += rq->raw_weighted_load;
}
/*
@@ -2238,11 +2198,11 @@
rq = cpu_rq(i);
if (rq->nr_running == 1 &&
- rq->lrq.raw_weighted_load > imbalance)
+ rq->raw_weighted_load > imbalance)
continue;
- if (rq->lrq.raw_weighted_load > max_load) {
- max_load = rq->lrq.raw_weighted_load;
+ if (rq->raw_weighted_load > max_load) {
+ max_load = rq->raw_weighted_load;
busiest = rq;
}
}
@@ -2576,6 +2536,32 @@
spin_unlock(&target_rq->lock);
}
+static void update_load(struct rq *this_rq)
+{
+ unsigned long this_load;
+ unsigned int i, scale;
+
+ this_load = this_rq->raw_weighted_load;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
+}
+
#ifdef CONFIG_NO_HZ
static struct {
atomic_t load_balancer;
@@ -2822,14 +2808,14 @@
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
}
-#else
+#else /* CONFIG_SMP */
/*
* on UP we do not need to balance between CPUs:
*/
static inline void idle_balance(int cpu, struct rq *rq)
{
}
-#endif
+#endif /* CONFIG_SMP */
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2953,8 +2939,8 @@
if (!idle_at_tick)
task_running_tick(rq, p);
- update_load_fair(rq);
#ifdef CONFIG_SMP
+ update_load(rq);
rq->idle_at_tick = idle_at_tick;
trigger_load_balance(cpu);
#endif
@@ -6090,6 +6076,18 @@
&& addr < (unsigned long)__sched_text_end);
}
+static inline void init_lrq(struct lrq *lrq, struct rq *rq)
+{
+ int j;
+
+ lrq->tasks_timeline = RB_ROOT;
+ lrq->fair_clock = 1;
+ lrq->last_tick = rq_clock(rq);
+ lrq->nr_running = 0;
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ lrq->cpu_load[j] = 0;
+}
+
void __init sched_init(void)
{
int highest_cpu = 0;
@@ -6110,12 +6108,12 @@
spin_lock_init(&rq->lock);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
- rq->lrq.tasks_timeline = RB_ROOT;
- rq->clock = rq->lrq.fair_clock = 1;
+ rq->clock = 1;
+ init_lrq(&rq->lrq, rq);
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->lrq.cpu_load[j] = 0;
#ifdef CONFIG_SMP
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
rq->sd = NULL;
rq->active_balance = 0;
rq->push_cpu = 0;
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c 2007-06-09 15:07:33.000000000 +0530
+++ current/kernel/sched_fair.c 2007-06-09 15:07:36.000000000 +0530
@@ -64,9 +64,7 @@
static long lrq_nr_running(struct lrq *lrq)
{
- struct rq *rq = lrq_rq(lrq);
-
- return rq->nr_running;
+ return lrq->nr_running;
}
#define entity_is_task(se) 1
@@ -119,6 +117,8 @@
rb_link_node(&p->run_node, parent, link);
rb_insert_color(&p->run_node, &lrq->tasks_timeline);
+ lrq->raw_weighted_load += p->load_weight;
+ lrq->nr_running++;
}
static inline void __dequeue_entity(struct lrq *lrq, struct sched_entity *p)
@@ -126,6 +126,8 @@
if (lrq->rb_leftmost == &p->run_node)
lrq->rb_leftmost = NULL;
rb_erase(&p->run_node, &lrq->tasks_timeline);
+ lrq->raw_weighted_load -= p->load_weight;
+ lrq->nr_running--;
}
static inline struct rb_node * first_fair(struct lrq *lrq)
@@ -570,12 +572,69 @@
update_stats_wait_start(lrq, prev, now);
}
+static void update_load_fair(struct lrq *this_lrq)
+{
+ unsigned long this_load, fair_delta, exec_delta, idle_delta;
+ u64 fair_delta64, exec_delta64, tmp64;
+ unsigned int i, scale;
+
+ this_lrq->nr_load_updates++;
+ if (!(sysctl_sched_features & 64)) {
+ this_load = this_lrq->raw_weighted_load;
+ goto do_avg;
+ }
+
+ fair_delta64 = this_lrq->delta_fair_clock + 1;
+ this_lrq->delta_fair_clock = 0;
+
+ exec_delta64 = this_lrq->delta_exec_clock + 1;
+ this_lrq->delta_exec_clock = 0;
+
+ if (fair_delta64 > (u64)LONG_MAX)
+ fair_delta64 = (u64)LONG_MAX;
+ fair_delta = (unsigned long)fair_delta64;
+
+ if (exec_delta64 > (u64)TICK_NSEC)
+ exec_delta64 = (u64)TICK_NSEC;
+ exec_delta = (unsigned long)exec_delta64;
+
+ idle_delta = TICK_NSEC - exec_delta;
+
+ tmp64 = SCHED_LOAD_SCALE * exec_delta64;
+ do_div(tmp64, fair_delta);
+ tmp64 *= exec_delta64;
+ do_div(tmp64, TICK_NSEC);
+ this_load = (unsigned long)tmp64;
+
+do_avg:
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_lrq->cpu_load[i];
+ new_load = this_load;
+
+ this_lrq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
+}
+
static void entity_tick(struct lrq *lrq, struct sched_entity *curr)
{
struct sched_entity *next;
struct rq *rq = lrq_rq(lrq);
u64 now = __rq_clock(rq);
+ /* replay load smoothening for all ticks we lost */
+ while (time_after_eq64(now, lrq->last_tick)) {
+ update_load_fair(lrq);
+ lrq->last_tick += TICK_NSEC;
+ }
+ /* deal with time wraps */
+ if (unlikely(now - lrq->last_tick > TICK_NSEC))
+ lrq->last_tick = now;
+
/*
* Dequeue and enqueue the task to update its
* position within the tree:
--
Regards,
vatsa
next prev parent reply other threads:[~2007-06-11 15:46 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-11 15:47 [RFC][PATCH 0/6] Add group fairness to CFS - v1 Srivatsa Vaddagiri
2007-06-11 15:50 ` [RFC][PATCH 1/6] Introduce struct sched_entity and struct lrq Srivatsa Vaddagiri
2007-06-11 18:48 ` Linus Torvalds
2007-06-11 18:56 ` Ingo Molnar
2007-06-12 2:15 ` [ckrm-tech] " Balbir Singh
2007-06-12 3:52 ` Srivatsa Vaddagiri
2007-06-11 15:52 ` [RFC][PATCH 2/6] task's cpu information needs to be always correct Srivatsa Vaddagiri
2007-06-12 2:17 ` [ckrm-tech] " Balbir Singh
2007-06-11 15:53 ` [RFC][PATCH 3/6] core changes in CFS Srivatsa Vaddagiri
2007-06-12 2:29 ` Balbir Singh
2007-06-12 4:22 ` Srivatsa Vaddagiri
2007-06-11 15:55 ` Srivatsa Vaddagiri [this message]
2007-06-12 9:03 ` [RFC][PATCH 4/6] Fix (bad?) interactions between SCHED_RT and SCHED_NORMAL tasks Dmitry Adamushko
2007-06-12 10:26 ` Srivatsa Vaddagiri
2007-06-12 12:23 ` Dmitry Adamushko
2007-06-12 13:30 ` Srivatsa Vaddagiri
2007-06-12 14:31 ` Dmitry Adamushko
2007-06-12 15:43 ` Srivatsa Vaddagiri
2007-06-11 15:56 ` [RFC][PATCH 5/6] core changes for group fairness Srivatsa Vaddagiri
2007-06-13 20:56 ` Dmitry Adamushko
2007-06-14 12:06 ` Srivatsa Vaddagiri
2007-06-11 15:58 ` [RFC][PATCH 6/6] Hook up to container infrastructure Srivatsa Vaddagiri
2007-06-11 16:02 ` [RFC][PATCH 0/6] Add group fairness to CFS - v1 Srivatsa Vaddagiri
2007-06-11 19:37 ` Ingo Molnar
2007-06-11 19:39 ` Ingo Molnar
2007-06-12 5:50 ` Srivatsa Vaddagiri
2007-06-12 6:26 ` Ingo Molnar
[not found] ` <20070612072742.GA785@in.ibm.com>
2007-06-12 10:56 ` Srivatsa Vaddagiri
2007-06-15 12:46 ` Kirill Korotaev
2007-06-15 14:06 ` Srivatsa Vaddagiri
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070611155504.GD2109@in.ibm.com \
--to=vatsa@linux.vnet.ibm.com \
--cc=akpm@linux-foundation.org \
--cc=balbir@in.ibm.com \
--cc=ckrm-tech@lists.sourceforge.net \
--cc=containers@lists.osdl.org \
--cc=dmitry.adamushko@gmail.com \
--cc=efault@gmx.de \
--cc=kernel@kolivas.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nickpiggin@yahoo.com.au \
--cc=pwil3058@bigpond.net.au \
--cc=tingy@cs.umass.edu \
--cc=tong.n.li@intel.com \
--cc=torvalds@linux-foundation.org \
--cc=wli@holomorphy.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.