* [PATCH] sched: staircase deadline misc fixes
@ 2007-03-28 16:37 Con Kolivas
2007-03-28 17:34 ` [ck] " Prakash Punnoor
` (3 more replies)
0 siblings, 4 replies; 92+ messages in thread
From: Con Kolivas @ 2007-03-28 16:37 UTC (permalink / raw)
To: linux list, Andrew Morton, Ingo Molnar, Andy Whitcroft, ck list
test.kernel.org found some idle time regressions in the latest update to the
staircase deadline scheduler and Andy Whitcroft helped me track down the
offending problem which was present in all previous RSDL schedulers but
previously wouldn't be manifest without changes in nice. So here is a bugfix
for the set_load_weight being incorrectly set and a few other minor
improvements. Thanks Andy!
I'm cautiously optimistic that we're at the thin edge of the bugfix wedge now.
---
set_load_weight() should be performed after p->quota is set. This fixes a
large SMP performance regression.
Make sure rr_interval is never set to less than one jiffy.
Some sanity checking in update_cpu_clock will prevent bogus sched_clock
values.
SCHED_BATCH tasks should not set the rq->best_static_prio field.
Correct sysctl rr_interval description to describe the value in milliseconds.
Style fixes.
Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
Documentation/sysctl/kernel.txt | 8 ++--
kernel/sched.c | 73 +++++++++++++++++++++++++++++-----------
2 files changed, 58 insertions(+), 23 deletions(-)
Index: linux-2.6.21-rc5-mm2/kernel/sched.c
===================================================================
--- linux-2.6.21-rc5-mm2.orig/kernel/sched.c 2007-03-28 09:01:03.000000000 +1000
+++ linux-2.6.21-rc5-mm2/kernel/sched.c 2007-03-29 00:02:33.000000000 +1000
@@ -88,10 +88,13 @@ unsigned long long __attribute__((weak))
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO)
-/* Some helpers for converting to/from nanosecond timing */
+/* Some helpers for converting to/from various scales.*/
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
-#define NS_TO_MS(TIME) ((TIME) / 1000000)
+#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define MS_TO_NS(TIME) ((TIME) * 1000000)
+/* Can return 0 */
+#define MS_TO_JIFFIES(TIME) ((TIME) * HZ / 1000)
+#define JIFFIES_TO_MS(TIME) ((TIME) * 1000 / HZ)
#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio)
@@ -852,16 +855,15 @@ static void requeue_task(struct task_str
/*
* task_timeslice - the total duration a task can run during one major
- * rotation.
+ * rotation. Returns value in jiffies.
*/
static inline int task_timeslice(struct task_struct *p)
{
- int slice, rr;
+ int slice;
- slice = rr = p->quota;
+ slice = NS_TO_JIFFIES(p->quota);
if (!rt_task(p))
- slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
- slice = NS_TO_JIFFIES(slice) ? : 1;
+ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
return slice;
}
@@ -875,7 +877,7 @@ static inline int task_timeslice(struct
(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(task_timeslice(p))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
- (LOAD_WEIGHT((rr_interval + 20 + (rp))))
+ (LOAD_WEIGHT((MS_TO_JIFFIES(rr_interval) + 20 + (rp))))
static void set_load_weight(struct task_struct *p)
{
@@ -973,11 +975,15 @@ static int effective_prio(struct task_st
* tick still. Below nice 0 they get progressively larger.
* ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
* nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
+ * Value returned is in nanoseconds.
*/
static unsigned int rr_quota(struct task_struct *p)
{
int nice = TASK_NICE(p), rr = rr_interval;
+ /* Ensure that rr_interval is at least 1 tick */
+ if (unlikely(!MS_TO_JIFFIES(rr)))
+ rr = rr_interval = JIFFIES_TO_MS(1) ? : 1;
if (!rt_task(p)) {
if (nice < -6) {
rr *= nice * nice;
@@ -3198,13 +3204,34 @@ EXPORT_PER_CPU_SYMBOL(kstat);
/*
* This is called on clock ticks and on context switches.
* Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here.
+ * The value returned from sched_clock() occasionally gives bogus values so
+ * some sanity checking is required.
*/
static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
+ int tick)
{
cputime64_t time_diff = now - p->last_ran;
+ unsigned int min_diff = 1000;
- /* cpu scheduler quota accounting is performed here */
+ if (tick) {
+ /*
+ * Called from scheduler_tick() there should be less than two
+ * jiffies worth, and not negative/overflow.
+ */
+ if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+ time_diff = JIFFIES_TO_NS(1);
+ } else {
+ /*
+ * Called from context_switch there should be less than one
+ * jiffy worth, and not negative/overflowed. In the case when
+ * sched_clock fails to return high resolution values this
+ * also ensures at least 1 min_diff gets banked.
+ */
+ if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
+ time_diff = min_diff;
+ }
if (p != rq->idle && p->policy != SCHED_FIFO)
p->time_slice -= time_diff;
p->sched_time += time_diff;
@@ -3353,7 +3380,7 @@ void scheduler_tick(void)
int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
- update_cpu_clock(p, rq, now);
+ update_cpu_clock(p, rq, now, 1);
if (!idle_at_tick)
task_running_tick(rq, p);
@@ -3425,7 +3452,7 @@ retry:
}
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
- if (unlikely(next->time_slice < 0)) {
+ if (unlikely(next->time_slice <= 0)) {
/*
* Unlucky enough that this task ran out of time_slice
* before it hit a scheduler_tick so it should have its
@@ -3438,7 +3465,8 @@ retry:
}
rq->prio_level = idx;
next->rotation = rq->prio_rotation;
- if (next->static_prio < rq->best_static_prio)
+ if (next->static_prio < rq->best_static_prio &&
+ next->policy != SCHED_BATCH)
rq->best_static_prio = next->static_prio;
return next;
}
@@ -3533,7 +3561,7 @@ switch_tasks:
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
- update_cpu_clock(prev, rq, now);
+ update_cpu_clock(prev, rq, now, 0);
prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
@@ -3978,7 +4006,8 @@ void rt_mutex_setprio(struct task_struct
rq = task_rq_lock(p, &flags);
oldprio = p->prio;
- if ((queued = task_queued(p)))
+ queued = task_queued(p);
+ if (queued)
dequeue_task(p, rq);
p->prio = prio;
@@ -4023,15 +4052,17 @@ void set_user_nice(struct task_struct *p
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- if ((queued = task_queued(p))) {
+ queued = task_queued(p);
+ if (queued) {
dequeue_task(p, rq);
dec_raw_weighted_load(rq, p);
}
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
old_prio = p->prio;
p->prio = effective_prio(p);
+ p->quota = rr_quota(p);
+ set_load_weight(p);
delta = p->prio - old_prio;
if (queued) {
@@ -4045,7 +4076,6 @@ void set_user_nice(struct task_struct *p
resched_task(rq->curr);
}
out_unlock:
- p->quota = rr_quota(p);
task_rq_unlock(rq, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4166,6 +4196,7 @@ static void __setscheduler(struct task_s
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
+ p->quota = rr_quota(p);
set_load_weight(p);
}
@@ -4254,7 +4285,8 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- if ((queued = task_queued(p)))
+ queued = task_queued(p);
+ if (queued)
deactivate_task(p, rq);
oldprio = p->prio;
__setscheduler(p, policy, param->sched_priority);
@@ -7088,7 +7120,8 @@ void normalize_rt_tasks(void)
spin_lock_irqsave(&p->pi_lock, flags);
rq = __task_rq_lock(p);
- if ((queued = task_queued(p)))
+ queued = task_queued(p);
+ if (queued)
deactivate_task(p, task_rq(p));
__setscheduler(p, SCHED_NORMAL, 0);
if (queued) {
Index: linux-2.6.21-rc5-mm2/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.21-rc5-mm2.orig/Documentation/sysctl/kernel.txt 2007-03-28 09:01:03.000000000 +1000
+++ linux-2.6.21-rc5-mm2/Documentation/sysctl/kernel.txt 2007-03-28 09:01:04.000000000 +1000
@@ -294,9 +294,11 @@ rr_interval:
This is the smallest duration that any cpu process scheduling unit
will run for. Increasing this value can increase throughput of cpu
bound tasks substantially but at the expense of increased latencies
-overall. This value is in _ticks_ and the default value chosen depends
-on the number of cpus available at scheduler initialisation. Valid
-values are from 1-100.
+overall. This value is in milliseconds and the default value chosen
+depends on the number of cpus available at scheduler initialisation
+with a minimum of 8.
+
+Valid values are from 1-100.
==============================================================
--
-ck
^ permalink raw reply [flat|nested] 92+ messages in thread* Re: [ck] [PATCH] sched: staircase deadline misc fixes 2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas @ 2007-03-28 17:34 ` Prakash Punnoor 2007-04-01 6:40 ` Prakash Punnoor 2007-03-28 18:48 ` Ingo Molnar ` (2 subsequent siblings) 3 siblings, 1 reply; 92+ messages in thread From: Prakash Punnoor @ 2007-03-28 17:34 UTC (permalink / raw) To: ck; +Cc: Con Kolivas, linux list [-- Attachment #1: Type: text/plain, Size: 848 bytes --] Am Mittwoch 28 März 2007 schrieb Con Kolivas: > I'm cautiously optimistic that we're at the thin edge of the bugfix wedge > now. > > --- > set_load_weight() should be performed after p->quota is set. This fixes a > large SMP performance regression. Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a regression with my Athlon X2. Namely using this ac3 encoder (http://aften.sourceforge.net/), which I parallelized in a simple way, with my test sample I remember having encoding times of ~5.4sec with vanilla and ~5.8 sec with rsdl - once the whole test wave is in cache. Otherwise you can easily I/O limit the encoder. ;-) You need to get sources from svn though. The current 0.06 release doesn't have threads support. Cheers, -- (°= =°) //\ Prakash Punnoor /\\ V_/ \_V [-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [ck] [PATCH] sched: staircase deadline misc fixes 2007-03-28 17:34 ` [ck] " Prakash Punnoor @ 2007-04-01 6:40 ` Prakash Punnoor [not found] ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com> 0 siblings, 1 reply; 92+ messages in thread From: Prakash Punnoor @ 2007-04-01 6:40 UTC (permalink / raw) To: ck; +Cc: linux list [-- Attachment #1: Type: text/plain, Size: 1178 bytes --] Am Mittwoch 28 März 2007 schrieb Prakash Punnoor: > Am Mittwoch 28 März 2007 schrieb Con Kolivas: > > I'm cautiously optimistic that we're at the thin edge of the bugfix wedge > > now. > > > > --- > > set_load_weight() should be performed after p->quota is set. This fixes a > > large SMP performance regression. > > Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a regression > with my Athlon X2. Namely using this ac3 encoder > (http://aften.sourceforge.net/), which I parallelized in a simple way, with > my test sample I remember having encoding times of ~5.4sec with vanilla and > ~5.8 sec with rsdl - once the whole test wave is in cache. Otherwise you > can easily I/O limit the encoder. ;-) You need to get sources from svn > though. The current 0.06 release doesn't have threads support. BTW, I confirmed this regression. With vanilla 2.6.21-rc5 I get back my 5.4 secs with the test sample and two threads. Furthermore for me vanilla actually feels nicer on my dual core, even with load - just subjectively that's why I ditched rsdl... Cheers, -- (°= =°) //\ Prakash Punnoor /\\ V_/ \_V [-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 92+ messages in thread
[parent not found: <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com>]
* Re: [ck] [PATCH] sched: staircase deadline misc fixes [not found] ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com> @ 2007-04-01 20:03 ` Prakash Punnoor 0 siblings, 0 replies; 92+ messages in thread From: Prakash Punnoor @ 2007-04-01 20:03 UTC (permalink / raw) To: michael chang; +Cc: ck, linux list [-- Attachment #1: Type: text/plain, Size: 1261 bytes --] Am Sonntag 01 April 2007 schrieb michael chang: > On 4/1/07, Prakash Punnoor <prakash@punnoor.de> wrote: > > Am Mittwoch 28 März 2007 schrieb Prakash Punnoor: > > > > > > Hi, I am using 2.6.21-rc5 with rsdl 0.37 and think I still see a > > > regression with my Athlon X2. Namely using this ac3 encoder > > > (http://aften.sourceforge.net/), which I parallelized in a simple way, > > > with my test sample I remember having encoding times of ~5.4sec with > > > vanilla and ~5.8 sec with rsdl - once the whole test wave is in cache. > > BTW, I confirmed this regression. With vanilla 2.76.21-rc5 I get back my > > 5.4 secs with the test sample and two threads. Furtmermore for me vanilla > > Which version of RSDL were you comparing to 2.6.21-rc5? Did you try > the patch in the first message (http://lkml.org/lkml/2007/3/28/146)? > The patch that _began_ this thread had SMP fixes in it... (Also, IIRC, > the latest version of the scheduler no longer has the rotating > component - so it's just SDl now.) As I said, I tried 0.37. Didn't it have the fix inside? Actually I am reluctant to go back to (r)sdl, as it didn't show improvements for me, yet. -- (°= =°) //\ Prakash Punnoor /\\ V_/ \_V [-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas 2007-03-28 17:34 ` [ck] " Prakash Punnoor @ 2007-03-28 18:48 ` Ingo Molnar 2007-03-28 23:44 ` Con Kolivas 2007-03-29 6:36 ` Con Kolivas 2007-04-23 8:58 ` Andrew Morton 3 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-03-28 18:48 UTC (permalink / raw) To: Con Kolivas; +Cc: linux list, Andrew Morton * Con Kolivas <kernel@kolivas.org> wrote: > I'm cautiously optimistic that we're at the thin edge of the bugfix > wedge now. hm, how about the questions Mike raised (there were a couple of cases of friction between 'the design as documented and announced' and 'the code as implemented')? As far as i saw they were still largely unanswered - but let me know if they are all answered and addressed: http://marc.info/?l=linux-kernel&m=117465220309006&w=2 http://marc.info/?l=linux-kernel&m=117489673929124&w=2 http://marc.info/?l=linux-kernel&m=117489831930240&w=2 and the numbers he posted: http://marc.info/?l=linux-kernel&m=117448900626028&w=2 his test conclusion was that under CPU load, RSDL (SD) generally does not hold up to mainline's interactivity. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-28 18:48 ` Ingo Molnar @ 2007-03-28 23:44 ` Con Kolivas 2007-03-29 5:50 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Con Kolivas @ 2007-03-28 23:44 UTC (permalink / raw) To: Ingo Molnar; +Cc: linux list, Andrew Morton, ck list On Thursday 29 March 2007 04:48, Ingo Molnar wrote: > hm, how about the questions Mike raised (there were a couple of cases of > friction between 'the design as documented and announced' and 'the code > as implemented')? As far as i saw they were still largely unanswered - > but let me know if they are all answered and addressed: I spent less time emailing and more time coding. I have been working on addressing whatever people brought up. > http://marc.info/?l=linux-kernel&m=117465220309006&w=2 Attended to. > http://marc.info/?l=linux-kernel&m=117489673929124&w=2 Attended to. > http://marc.info/?l=linux-kernel&m=117489831930240&w=2 Checked fine. > and the numbers he posted: > > http://marc.info/?l=linux-kernel&m=117448900626028&w=2 Attended to. > his test conclusion was that under CPU load, RSDL (SD) generally does > not hold up to mainline's interactivity. There have been improvements since the earlier iterations but it's still a fairness based design. Mike's "sticking point" test case should be improved as well. My call based on my own testing and feedback from users is: Under niced loads it is 99% in favour of SD. Under light loads it is 95% in favour of SD. Under Heavy loads it becomes proportionately in favour of mainline. The crossover is somewhere around a load of 4. If the reluctance to renice X goes away I'd say it was 99% across the board and to much higher loads. > Ingo -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-28 23:44 ` Con Kolivas @ 2007-03-29 5:50 ` Mike Galbraith 2007-03-29 6:29 ` Mike Galbraith ` (2 more replies) 0 siblings, 3 replies; 92+ messages in thread From: Mike Galbraith @ 2007-03-29 5:50 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote: > On Thursday 29 March 2007 04:48, Ingo Molnar wrote: > > hm, how about the questions Mike raised (there were a couple of cases of > > friction between 'the design as documented and announced' and 'the code > > as implemented')? As far as i saw they were still largely unanswered - > > but let me know if they are all answered and addressed: > > I spent less time emailing and more time coding. I have been working on > addressing whatever people brought up. > > > http://marc.info/?l=linux-kernel&m=117465220309006&w=2 > > Attended to. > > > http://marc.info/?l=linux-kernel&m=117489673929124&w=2 > > Attended to. > > > http://marc.info/?l=linux-kernel&m=117489831930240&w=2 > > Checked fine. That one's not fine. +static void recalc_task_prio(struct task_struct *p, struct rq *rq) +{ + struct prio_array *array = rq->active; + int queue_prio; + + update_if_moved(p, rq); + if (p->rotation == rq->prio_rotation) { + if (p->array == array) { + if (p->time_slice > 0) + return; + p->time_slice = p->quota; + } else if (p->array == rq->expired) { You implemented nanosecond accounting, but here you give a task which has either missed the tick often enough, or accumulated enough cross cpu clock drift to have an I.O.U. in its wallet a shiny new $8 bill. WRT clock drift/timewarps, your latest code cedes that these do occur, but where these timewarps can be anywhere between minuscule with Intel same package processors, up to a tick elsewhere, charges a tick. 
- /* cpu scheduler quota accounting is performed here */ + if (tick) { + /* + * Called from scheduler_tick() there should be less than two + * jiffies worth, and not negative/overflow. + */ + if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff) + time_diff = JIFFIES_TO_NS(1); > > and the numbers he posted: > > > > http://marc.info/?l=linux-kernel&m=117448900626028&w=2 > > Attended to. Hm. How, where? I'm getting inconsistent results with current, but sleeping tasks still don't _appear_ to be able to compete with hogs on an equal footing, and I don't see how they really can. What happens if a sleeper sleeps after using say half of it's slice, and the hog it's sharing the CPU with then sleeps briefly after using most of it's slice. That's the end of the rotation. They are put back on an equal footing, but what just happened to the differential in cpu usage? > > his test conclusion was that under CPU load, RSDL (SD) generally does > > not hold up to mainline's interactivity. > > There have been improvements since the earlier iterations but it's still a > fairness based design. Mike's "sticking point" test case should be improved > as well. The behavior is different, and is less ragged, but I wouldn't say it's really been improved. The below was added as a workaround. + * This contains a bitmap for each dynamic priority level with empty slots + * for the valid priorities each different nice level can have. It allows + * us to stagger the slots where differing priorities run in a way that + * keeps latency differences between different nice levels at a minimum. 
+ * ie, where 0 means a slot for that priority, priority running from left to + * right: + * nice -20 0000000000000000000000000000000000000000 + * nice -10 1001000100100010001001000100010010001000 + * nice 0 0101010101010101010101010101010101010101 + * nice 5 1101011010110101101011010110101101011011 + * nice 10 0110111011011101110110111011101101110111 + * nice 15 0111110111111011111101111101111110111111 + * nice 19 1111111111111111111011111111111111111111 I don't really know what to say about this. I think it explains reduced context switching, but I don't see how this could be a good thing. Consider a nice -20 fast/light task trying to get CPU with nice 0 tasks being constantly spawned. How can this latency bound fast mover perform if it can't preempt? What am I missing? > My call based on my own testing and feedback from users is: > > Under niced loads it is 99% in favour of SD. > > Under light loads it is 95% in favour of SD. > > Under Heavy loads it becomes proportionately in favour of mainline. The > crossover is somewhere around a load of 4. Opinion polls are nice, but I'm more interested in gathering numbers which either validate or invalidate the claims of the design documents. WRT this subjective opinion thing, I see regressions with all loads, and I don't see what a < 95% load really means. If CPU isn't contended, dishing it out is dirt simple. Just give everybody frequent, and fairly short chunks, and everybody is fairly happy. The only time scheduling becomes interesting is when there IS contention, and mainline seems to do much better at this, with the caveat that the history mechanism indeed doesn't always get it right. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 5:50 ` Mike Galbraith @ 2007-03-29 6:29 ` Mike Galbraith 2007-03-29 6:54 ` Mike Galbraith 2007-03-29 8:18 ` Mike Galbraith 2007-04-03 2:37 ` Con Kolivas 2 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-03-29 6:29 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote: > Opinion polls are nice, but I'm more interested in gathering numbers > which either validate or invalidate the claims of the design documents. Suggestion: try the testcase that Satoru Takeuch posted. The numbers I got with latest SD were no better than the numbers I got with the patch I posted to try to solve it. Seems to me the numbers with SD should have been much better, but they in fact were not. Running that thing, mainline's GUI was not usable, even with my patch, but neither was it usable with SD. What's the difference between horrible with mainline and merely terrible with SD? In both, the GUI ends up doing round-robin with a slew of hogs. In mainline, this happens because the history logic can and does get it wrong sometimes, which this exploit deliberately triggers. With SD, it's by design. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 6:29 ` Mike Galbraith @ 2007-03-29 6:54 ` Mike Galbraith 0 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-03-29 6:54 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list Oh my, I'm on a roll here... somebody stop me ;-) Some emphasis: On Thu, 2007-03-29 at 08:29 +0200, Mike Galbraith wrote: > On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote: > > > Opinion polls are nice, but I'm more interested in gathering numbers > > which either validate or invalidate the claims of the design documents. > > Suggestion: try the testcase that Satoru Takeuch posted. The numbers I > got with latest SD were no better than the numbers I got with the patch > I posted to try to solve it. Seems to me the numbers with SD should > have been much better, but they in fact were not. > > Running that thing, mainline's GUI was not usable, even with my patch, > but neither was it usable with SD. What's the difference between > horrible with mainline and merely terrible with SD? In both, the GUI > ends up doing round-robin with a slew of hogs. In mainline, this > happens because the history logic can and does get it wrong sometimes, > which this exploit deliberately triggers. With SD, it's by design. The much maligned history mechanism in mainline didn't start it's life as an interactivity estimator, that's a name it acquired later. What it was first put there for was to ensure fairness for sleeping tasks. I found it most ironic that the numbers I posted showed that mechanism working perfectly, with an exploit that was designed specifically to expose it's weakness, despite the deliberate tweaks that have gone in tweaking it very heavily in the unfair direction, and this went uncommented. If I had run more of them, it would have shown that weakness very well. We all know that weakness exists. 
What the numbers clearly showed was that sleeping tasks did not get the fairness RSDL advertised with the particular test I ran, yet it went uncommented/uncontested. Anyone could have tested with the trivial proggy of their choice... but nobody did. The history mechanism is not only about interactivity, and never was. -Mike I'm gonna go piddle around with code now, much more fun than yacking :) ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 5:50 ` Mike Galbraith 2007-03-29 6:29 ` Mike Galbraith @ 2007-03-29 8:18 ` Mike Galbraith 2007-03-29 12:55 ` [ck] " michael chang 2007-04-03 2:35 ` Con Kolivas 2007-04-03 2:37 ` Con Kolivas 2 siblings, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-03-29 8:18 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list Rereading to make sure I wasn't unclear anywhere... On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote: > > I don't see what a < 95% load really means. Egad. Here I'm pondering the numbers and light load as I'm typing, and my fingers (seemingly independent when mind wanders off) typed < 95% as in not fully committed, instead of "light". -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [ck] Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 8:18 ` Mike Galbraith @ 2007-03-29 12:55 ` michael chang 2007-04-03 2:35 ` Con Kolivas 1 sibling, 0 replies; 92+ messages in thread From: michael chang @ 2007-03-29 12:55 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, ck list, linux list, Andrew Morton On 3/29/07, Mike Galbraith <efault@gmx.de> wrote: > Rereading to make sure I wasn't unclear anywhere... > > On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote: > > > > I don't see what a < 95% load really means. > > Egad. Here I'm pondering the numbers and light load as I'm typing, and > my fingers (seemingly independent when mind wanders off) typed < 95% as > in not fully committed, instead of "light". While I don't know the _exact_ figure for this, my hunch is that a good ballpark figure is anything that is not a heavy load (less than 4, perhaps even lower, maybe <0.75 or <2?) and that is not a "niced" load. -- -- Michael Chang ~Just the crazy copy cat~ ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 8:18 ` Mike Galbraith 2007-03-29 12:55 ` [ck] " michael chang @ 2007-04-03 2:35 ` Con Kolivas 1 sibling, 0 replies; 92+ messages in thread From: Con Kolivas @ 2007-04-03 2:35 UTC (permalink / raw) To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Thursday 29 March 2007 18:18, Mike Galbraith wrote: > Rereading to make sure I wasn't unclear anywhere... > > On Thu, 2007-03-29 at 07:50 +0200, Mike Galbraith wrote: > > I don't see what a < 95% load really means. > > Egad. Here I'm pondering the numbers and light load as I'm typing, and > my fingers (seemingly independent when mind wanders off) typed < 95% as > in not fully committed, instead of "light". 95% of cases where load is less than 4; not 95% load. -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-29 5:50 ` Mike Galbraith 2007-03-29 6:29 ` Mike Galbraith 2007-03-29 8:18 ` Mike Galbraith @ 2007-04-03 2:37 ` Con Kolivas 2007-04-03 5:31 ` Mike Galbraith 2 siblings, 1 reply; 92+ messages in thread From: Con Kolivas @ 2007-04-03 2:37 UTC (permalink / raw) To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list [-- Attachment #1: Type: text/plain, Size: 1059 bytes --] On Thursday 29 March 2007 15:50, Mike Galbraith wrote: > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote: > + * This contains a bitmap for each dynamic priority level with empty slots > + * for the valid priorities each different nice level can have. It allows > + * us to stagger the slots where differing priorities run in a way that > + * keeps latency differences between different nice levels at a minimum. > + * ie, where 0 means a slot for that priority, priority running from left > to + * right: > + * nice -20 0000000000000000000000000000000000000000 > + * nice -10 1001000100100010001001000100010010001000 > + * nice 0 0101010101010101010101010101010101010101 > + * nice 5 1101011010110101101011010110101101011011 > + * nice 10 0110111011011101110110111011101101110111 > + * nice 15 0111110111111011111101111101111110111111 > + * nice 19 1111111111111111111011111111111111111111 Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, and then SD. This is why you can't renice X on mainline. > -Mike -- -ck [-- Attachment #2: chew.c --] [-- Type: text/x-csrc, Size: 1027 bytes --] /* * orignal idea by Chris Friesen. Thanks. 
*/ #include <stdio.h> #include <sys/time.h> #include <sys/resource.h> #define THRESHOLD_USEC 2000 unsigned long long stamp() { struct timeval tv; gettimeofday(&tv, 0); return (unsigned long long) tv.tv_usec + ((unsigned long long) tv.tv_sec)*1000000; } int main() { unsigned long long thresh_ticks = THRESHOLD_USEC; unsigned long long cur,last; struct timespec ts; sched_rr_get_interval(0, &ts); printf("pid %d, prio %3d, interval of %d nsec\n", getpid(), getpriority(PRIO_PROCESS, 0), ts.tv_nsec); last = stamp(); while(1) { cur = stamp(); unsigned long long delta = cur-last; if (delta > thresh_ticks) { printf("pid %d, prio %3d, out for %4llu ms\n", getpid(), getpriority(PRIO_PROCESS, 0), delta/1000); cur = stamp(); } last = cur; } return 0; } ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 2:37 ` Con Kolivas @ 2007-04-03 5:31 ` Mike Galbraith 2007-04-03 6:00 ` Mike Galbraith ` (2 more replies) 0 siblings, 3 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-03 5:31 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote: > On Thursday 29 March 2007 15:50, Mike Galbraith wrote: > > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote: > > + * This contains a bitmap for each dynamic priority level with empty slots > > + * for the valid priorities each different nice level can have. It allows > > + * us to stagger the slots where differing priorities run in a way that > > + * keeps latency differences between different nice levels at a minimum. > > + * ie, where 0 means a slot for that priority, priority running from left > > to + * right: > > + * nice -20 0000000000000000000000000000000000000000 > > + * nice -10 1001000100100010001001000100010010001000 > > + * nice 0 0101010101010101010101010101010101010101 > > + * nice 5 1101011010110101101011010110101101011011 > > + * nice 10 0110111011011101110110111011101101110111 > > + * nice 15 0111110111111011111101111101111110111111 > > + * nice 19 1111111111111111111011111111111111111111 > > Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, > and then SD. This is why you can't renice X on mainline. How about something more challenging instead :) The numbers below are from my scheduler tree with massive_intr running at nice 0, and chew at nice 5. Below these numbers are 100 lines from the exact center of chew's output. 
(interactivity remains intact with this rather heavy load) root@Homer: ./massive_intr 30 180 005671 00001506 005657 00001506 005651 00001491 005647 00001466 005661 00001484 005660 00001475 005645 00001514 005668 00001384 005673 00001516 005656 00001449 005664 00001512 005659 00001507 005667 00001513 005663 00001521 005670 00001440 005649 00001522 005652 00001487 005648 00001405 005665 00001472 005669 00001418 005662 00001489 005674 00001523 005650 00001480 005655 00001476 005672 00001530 005653 00001463 005654 00001427 005646 00001499 005658 00001510 005666 00001476 100 sequential lines from the middle of chew's logged output. pid 5642, prio 5, out for 2 ms, ran for 1 ms, load 34% pid 5642, prio 5, out for 1268 ms, ran for 63 ms, load 4% pid 5642, prio 5, out for 52 ms, ran for 0 ms, load 0% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 18% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 5642, prio 5, out for 4 ms, ran for 1 ms, load 22% pid 5642, prio 5, out for 1395 ms, ran for 50 ms, load 3% pid 5642, prio 5, out for 26 ms, ran for 0 ms, load 3% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 13% pid 5642, prio 5, out for 7 ms, ran for 0 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 20% pid 5642, prio 5, out for 7 ms, ran 
for 1 ms, load 14% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 13% pid 5642, prio 5, out for 1400 ms, ran for 53 ms, load 3% pid 5642, prio 5, out for 22 ms, ran for 1 ms, load 6% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 2 ms, ran for 1 ms, load 49% pid 5642, prio 5, out for 1281 ms, ran for 50 ms, load 3% pid 5642, prio 5, out for 50 ms, ran for 0 ms, load 1% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 13% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 4 ms, ran for 1 ms, load 31% pid 5642, prio 5, out for 1248 ms, ran for 53 ms, load 4% pid 5642, prio 5, out for 44 ms, ran for 0 ms, load 1% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 7 ms, ran for 1 ms, 
load 13% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 1311 ms, ran for 55 ms, load 4% pid 5642, prio 5, out for 121 ms, ran for 0 ms, load 0% pid 5642, prio 5, out for 22 ms, ran for 0 ms, load 1% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 16% pid 5642, prio 5, out for 6 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 1289 ms, ran for 50 ms, load 3% pid 5642, prio 5, out for 38 ms, ran for 0 ms, load 1% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 14% pid 5642, prio 5, out for 9 ms, ran for 1 ms, load 11% pid 5642, prio 5, out for 6 ms, ran for 1 ms, load 22% pid 5642, prio 5, out for 1348 ms, ran for 53 ms, load 3% pid 5642, prio 5, out for 8 ms, ran for 0 ms, load 10% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 15% pid 5642, prio 5, out for 7 ms, ran for 0 ms, load 11% pid 5642, prio 5, out for 8 ms, ran for 1 ms, load 12% pid 5642, prio 5, out for 1385 ms, ran for 65 ms, load 4% pid 5642, prio 5, out for 1385 ms, ran for 74 ms, load 5% pid 5642, prio 5, out for 7 ms, ran for 1 ms, 
load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 5642, prio 5, out for 6 ms, ran for 1 ms, load 20% pid 5642, prio 5, out for 1375 ms, ran for 66 ms, load 4% ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 5:31 ` Mike Galbraith @ 2007-04-03 6:00 ` Mike Galbraith 2007-04-03 6:01 ` Ingo Molnar 2007-04-03 10:57 ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith 2 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-03 6:00 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Tue, 2007-04-03 at 07:31 +0200, Mike Galbraith wrote: > On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote: > > On Thursday 29 March 2007 15:50, Mike Galbraith wrote: > > > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote: > > > + * This contains a bitmap for each dynamic priority level with empty slots > > > + * for the valid priorities each different nice level can have. It allows > > > + * us to stagger the slots where differing priorities run in a way that > > > + * keeps latency differences between different nice levels at a minimum. > > > + * ie, where 0 means a slot for that priority, priority running from left > > > to + * right: > > > + * nice -20 0000000000000000000000000000000000000000 > > > + * nice -10 1001000100100010001001000100010010001000 > > > + * nice 0 0101010101010101010101010101010101010101 > > > + * nice 5 1101011010110101101011010110101101011011 > > > + * nice 10 0110111011011101110110111011101101110111 > > > + * nice 15 0111110111111011111101111101111110111111 > > > + * nice 19 1111111111111111111011111111111111111111 > > > > Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, > > and then SD. This is why you can't renice X on mainline. > > How about something more challenging instead :) > > The numbers below are from my scheduler tree with massive_intr running > at nice 0, and chew at nice 5. Below these numbers are 100 lines from > the exact center of chew's output. > > (interactivity remains intact with this rather heavy load) Here are the numbers for 2.6.21-rc5 with only the earlier mentioned patch. 
Chew's log is only 20% as long as that from my other tree, and interactivity suffers badly while running this exploit, but as you can see, chew isn't dying of boredom. -Mike root@Homer: ./massive_intr 30 180 006701 00001509 006693 00001571 006707 00001072 006690 00001582 006691 00001547 006692 00001336 006695 00001759 006710 00001766 006699 00001531 006688 00001405 006709 00001907 006703 00001572 006705 00001501 006697 00001617 006686 00001344 006713 00001922 006714 00001885 006704 00001491 006694 00001482 006689 00001395 006711 00001176 006715 00001471 006708 00001527 006687 00001200 006706 00001451 006698 00001246 006702 00001495 006696 00001421 006712 00001414 006700 00001047 pid 6683, prio 5, out for 46 ms, ran for 0 ms, load 0% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 6 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 3527 ms, ran for 69 ms, load 1% pid 6683, prio 5, out for 52 ms, ran for 1 ms, load 2% pid 6683, prio 5, out for 15 ms, ran for 1 ms, load 6% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 15% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 13% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 3925 ms, ran for 56 ms, load 1% pid 6683, prio 5, out for 30 ms, ran for 1 ms, load 3% pid 6683, prio 5, out for 24 ms, ran for 1 ms, load 6% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 11% pid 6683, prio 5, out for 5 ms, ran for 0 ms, load 16% pid 6683, prio 5, out for 376 ms, ran for 54 ms, load 12% pid 6683, prio 5, out for 3320 ms, ran for 9 ms, load 0% pid 6683, prio 5, out for 3895 ms, 
ran for 74 ms, load 1% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 26% pid 6683, prio 5, out for 3364 ms, ran for 68 ms, load 2% pid 6683, prio 5, out for 4676 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 3726 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 3223 ms, ran for 74 ms, load 2% pid 6683, prio 5, out for 7 ms, ran for 0 ms, load 4% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 13% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 20% pid 6683, prio 5, out for 9 ms, ran for 1 ms, load 12% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 3562 ms, ran for 67 ms, load 1% pid 6683, prio 5, out for 4372 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 6831 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 756 ms, ran for 74 ms, load 9% pid 6683, prio 5, out for 27 ms, ran for 0 ms, load 1% pid 6683, prio 5, out for 4 ms, ran for 1 ms, load 20% pid 6683, prio 5, out for 3619 ms, ran for 71 ms, load 1% pid 6683, prio 5, out for 7 ms, ran for 0 ms, load 11% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 30% pid 6683, prio 5, out for 7 ms, ran for 34 ms, load 82% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 30% pid 6683, prio 5, out for 3182 ms, ran for 34 ms, load 1% pid 6683, prio 5, out for 4559 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 2937 ms, ran for 74 ms, load 2% pid 6683, prio 5, out for 19 ms, ran for 1 ms, load 8% pid 6683, prio 5, out for 3869 ms, ran for 72 ms, load 1% pid 6683, prio 5, out for 5 ms, ran for 0 ms, load 3% pid 6683, prio 5, out for 3375 ms, ran for 75 ms, load 2% pid 6683, prio 5, out for 4300 ms, ran for 74 ms, load 1% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 31% pid 6683, prio 5, out for 5949 ms, ran for 72 ms, load 1% pid 6683, prio 5, out for 5314 ms, ran for 73 ms, load 1% 
pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 14% pid 6683, prio 5, out for 9 ms, ran for 1 ms, load 14% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 34% pid 6683, prio 5, out for 4067 ms, ran for 70 ms, load 1% pid 6683, prio 5, out for 16 ms, ran for 7 ms, load 32% pid 6683, prio 5, out for 4149 ms, ran for 66 ms, load 1% pid 6683, prio 5, out for 3 ms, ran for 1 ms, load 27% pid 6683, prio 5, out for 2366 ms, ran for 72 ms, load 2% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 7 ms, ran for 0 ms, load 10% pid 6683, prio 5, out for 1459 ms, ran for 73 ms, load 4% pid 6683, prio 5, out for 3121 ms, ran for 74 ms, load 2% pid 6683, prio 5, out for 3070 ms, ran for 74 ms, load 2% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 11% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 12% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 1303 ms, ran for 66 ms, load 4% pid 6683, prio 5, out for 10 ms, ran for 1 ms, load 10% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 16% pid 6683, prio 5, out for 5 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 2350 ms, ran for 68 ms, load 2% pid 6683, prio 5, out for 5 ms, ran for 0 ms, load 15% pid 6683, prio 5, out for 3242 ms, ran for 75 ms, load 2% pid 6683, prio 5, out for 2684 ms, ran for 74 ms, load 2% pid 6683, prio 5, out for 4941 ms, ran for 75 ms, load 1% pid 6683, prio 5, out for 1119 ms, ran for 74 ms, load 6% pid 6683, prio 5, out for 8 ms, ran for 0 ms, load 10% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 19% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 5 ms, ran for 1 ms, load 17% pid 6683, prio 5, out for 3701 ms, ran for 67 ms, load 1% pid 6683, prio 5, out for 2 ms, ran for 1 ms, load 43% pid 6683, prio 5, out for 3486 ms, ran for 72 ms, load 2% pid 6683, prio 5, out for 8 ms, ran for 0 ms, load 5% pid 6683, prio 5, out for 7 ms, 
ran for 1 ms, load 20% pid 6683, prio 5, out for 5 ms, ran for 1 ms, load 24% pid 6683, prio 5, out for 5413 ms, ran for 69 ms, load 1% pid 6683, prio 5, out for 2251 ms, ran for 74 ms, load 3% pid 6683, prio 5, out for 8 ms, ran for 1 ms, load 18% pid 6683, prio 5, out for 7 ms, ran for 1 ms, load 20% pid 6683, prio 5, out for 5 ms, ran for 1 ms, load 20% ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 5:31 ` Mike Galbraith 2007-04-03 6:00 ` Mike Galbraith @ 2007-04-03 6:01 ` Ingo Molnar 2007-04-03 6:11 ` Mike Galbraith 2007-04-05 11:02 ` Mike Galbraith 2007-04-03 10:57 ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith 2 siblings, 2 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-03 6:01 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list * Mike Galbraith <efault@gmx.de> wrote: > > Try two instances of chew.c at _differing_ nice levels on one cpu on > > mainline, and then SD. This is why you can't renice X on mainline. > > How about something more challenging instead :) > > The numbers below are from my scheduler tree with massive_intr running > at nice 0, and chew at nice 5. Below these numbers are 100 lines from > the exact center of chew's output. > > (interactivity remains intact with this rather heavy load) looks interesting - could you send the patch? Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 6:01 ` Ingo Molnar @ 2007-04-03 6:11 ` Mike Galbraith 2007-04-05 11:02 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-03 6:11 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > * Mike Galbraith <efault@gmx.de> wrote: > > > > Try two instances of chew.c at _differing_ nice levels on one cpu on > > > mainline, and then SD. This is why you can't renice X on mainline. > > > > How about something more challenging instead :) > > > > The numbers below are from my scheduler tree with massive_intr running > > at nice 0, and chew at nice 5. Below these numbers are 100 lines from > > the exact center of chew's output. > > > > (interactivity remains intact with this rather heavy load) > > looks interesting - could you send the patch? Sorry, that tree is not _even_ ready for viewing yet. (and it's got an occasional oops bug i have to kill) -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 6:01 ` Ingo Molnar 2007-04-03 6:11 ` Mike Galbraith @ 2007-04-05 11:02 ` Mike Galbraith 2007-04-05 11:09 ` Ingo Molnar 2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar 1 sibling, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 11:02 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > looks interesting - could you send the patch? Ok, this is looking/feeling pretty good in testing. Comments on fugliness etc much appreciated. Below the numbers is a snapshot of my experimental tree. It's a mixture of my old throttling/anti-starvation tree and the task promotion patch, with the addition of a scheduling class for interactive tasks to dish out some of that targeted unfairness I mentioned. SCHED_INTERACTIVE is also targeted at the scenario where X or one of it's clients uses enough CPU to end up in the expired array. 
(note: Xorg was not set SCHED_INTERACTIVE during the test runs below) -Mike top - 12:31:34 up 16 min, 13 users, load average: 7.37, 8.74, 6.58 PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P COMMAND 6542 root 15 0 1568 108 24 S 43 0.0 0:58.98 1 fiftypercent 6540 root 17 0 1568 440 356 R 30 0.0 1:00.04 0 fiftypercent 6544 root 18 0 1568 108 24 R 28 0.0 0:58.36 0 fiftypercent 6541 root 20 0 1568 108 24 R 26 0.0 0:57.70 1 fiftypercent 6536 root 25 0 1436 356 296 R 24 0.0 0:45.76 1 chew 6538 root 25 0 1436 356 296 R 20 0.0 0:49.73 0 chew 6543 root 19 0 1568 108 24 R 19 0.0 0:58.04 1 fiftypercent 6409 root 15 0 154m 63m 27m R 2 6.3 0:13.09 0 amarokapp 6410 root 15 0 154m 63m 27m S 2 6.3 0:14.36 0 amarokapp 6376 root 15 0 2380 1092 764 R 2 0.1 0:15.63 0 top 5591 root 18 0 4736 1036 736 S 1 0.1 0:00.14 1 smpppd 5678 root 15 0 167m 24m 4848 S 1 2.4 0:19.37 0 Xorg 6202 root 15 0 32364 18m 12m S 1 1.8 0:04.25 1 konsole 50 lines from center of chew nailed to cpu0's log pid 6538, prio 0, out for 27 ms, ran for 1 ms, load 6% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 27 ms, ran for 7 ms, load 20% pid 6538, prio 0, out for 13 ms, ran for 5 ms, load 27% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 49% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 6 ms, load 42% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 48% pid 6538, prio 0, out for 9 ms, ran for 27 ms, load 74% pid 6538, prio 0, out for 27 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 26 ms, ran for 5 ms, load 17% pid 6538, prio 0, out for 27 ms, ran for 5 ms, load 17% pid 6538, prio 0, out for 28 ms, ran for 6 ms, load 18% pid 6538, prio 0, out for 30 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 18 ms, ran for 5 ms, load 24% pid 6538, prio 0, out for 
9 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 45% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 45% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 44% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 2 ms, ran for 7 ms, load 78% pid 6538, prio 0, out for 45 ms, ran for 22 ms, load 33% pid 6538, prio 0, out for 31 ms, ran for 2 ms, load 7% pid 6538, prio 0, out for 62 ms, ran for 1 ms, load 3% pid 6538, prio 0, out for 29 ms, ran for 3 ms, load 11% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 134 ms, ran for 5 ms, load 4% pid 6538, prio 0, out for 78 ms, ran for 2 ms, load 3% pid 6538, prio 0, out for 9 ms, ran for 3 ms, load 28% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 48% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 6 ms, load 39% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 14 ms, ran for 6 ms, load 30% pid 6538, prio 0, out for 27 ms, ran for 3 ms, load 12% pid 6538, prio 0, out for 29 ms, ran for 4 ms, load 12% pid 6538, prio 0, out for 29 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 29 ms, ran for 5 ms, load 14% pid 6538, prio 0, out for 27 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 26 ms, ran for 5 ms, load 16% pid 6538, prio 0, out for 24 ms, ran for 6 ms, load 20% pid 6538, prio 0, out for 7 ms, ran for 7 ms, load 49% root@Homer: ./massive_intr 30 180 006502 00002373 006495 00002687 006518 00002417 006490 00002544 006500 00002417 006494 00002427 006498 00003032 006517 00003060 006505 00002401 006507 00002375 
006514 00002398 006497 00002483 006506 00002388 006504 00002415 006510 00002472 006516 00002365 006509 00002441 006503 00002498 006512 00002930 006496 00002565 006492 00002389 006501 00002337 006508 00002395 006491 00002486 006499 00002394 006493 00002667 006515 00002569 006511 00002555 006513 00002637 006519 00002556 --- linux-2.6.21-rc5-x/include/linux/sched.h.org 2007-03-30 05:08:47.000000000 +0200 +++ linux-2.6.21-rc5-x/include/linux/sched.h 2007-04-02 08:17:30.000000000 +0200 @@ -34,6 +34,7 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_INTERACTIVE 4 #ifdef __KERNEL__ @@ -528,7 +529,7 @@ struct signal_struct { #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) +#define is_rt_policy(p) ((p) == SCHED_RR || (p) == SCHED_FIFO) #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) /* @@ -820,14 +821,14 @@ struct task_struct { #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; + unsigned long sleep_avg, last_slice, throttle; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ enum sleep_type sleep_type; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int time_slice, slice_info; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; --- linux-2.6.21-rc5-x/include/linux/sysctl.h.org 2007-03-31 12:52:52.000000000 +0200 +++ linux-2.6.21-rc5-x/include/linux/sysctl.h 2007-04-01 08:04:02.000000000 +0200 @@ -165,6 +165,8 @@ enum KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_SCHED_THROTTLE1=77, /* int: throttling credit period 1 in secs */ + KERN_SCHED_THROTTLE2=78, 
/* int: throttling credit period 2 in secs */ }; --- linux-2.6.21-rc5-x/kernel/sched.c.org 2007-03-27 15:47:49.000000000 +0200 +++ linux-2.6.21-rc5-x/kernel/sched.c 2007-04-05 12:06:38.000000000 +0200 @@ -90,6 +90,20 @@ unsigned long long __attribute__((weak)) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -109,6 +123,8 @@ unsigned long long __attribute__((weak)) #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define PCNT_PER_DYNPRIO (100 / MAX_BONUS) +#define INTERACTIVE_LIMIT (DEF_TIMESLICE * 4) /* * If a task is 'interactive' then we reinsert it in the active @@ -167,6 +183,133 @@ unsigned long long __attribute__((weak)) (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +#define INTERACTIVE_LIMIT_EXCEEDED(rq) \ + ((rq)->active->interactive_ticks + (rq)->expired->interactive_ticks > \ + INTERACTIVE_LIMIT) + +/* + * Interactive boost can lead to starvation if the decision to + * boost a task turns out to be a bad one. To combat this, we + * compute the sane upper limit for cpu usage 'slice_avg' based + * upon a task's sleep_avg, and use this information combined + * with a timer to determine when intervention is required. 
+ * + * When a task is behaving as it's sleep_avg indicates it should, + * it's throttle is moved forward, otherwise it will timeout, and + * it's priority will be lowered. + * + * Throttling tunables. + * + * CREDIT_C1: The amount of cpu time in seconds that a new task + * will run completely free, ie the head start a task + * has before it has to push it's timer forward to avoid + * being throttled. Each conforming slice thereafter + * increases it's stored credit, and vice versa. + * + * CREDIT_C2: The maximum amount of CPU time in seconds a task + * can store for later use. When a task has no stored + * credit left, now is time C2. Tasks begin life with + * C1 seconds credit, ie C2 is C1 seconds in front of + * them, and the 'buffer' will grow in front of them + * if they perform in a conformant manner. The maximum + * credit that fits in 32 bits jiffies is 42949 seconds. + */ + +int credit_c1 = 0; +int credit_c2 = 14400; +int credit_max = 42949; + +#define C1 (credit_c1 * MAX_BONUS * HZ) +#define C2 (credit_c2 * MAX_BONUS * HZ + C1) +#define C3 (MAX_BONUS * C2) + +#define credit_exhausted(p, credit) \ + (time_after_eq(jiffies, (p)->throttle + (credit))) + +/* + * Masks for p->slice_info, formerly p->first_time_slice. + * SLICE_FTS: 0x80000000 Task is in it's first ever timeslice. + * SLICE_NEW: 0x40000000 Slice refreshed. + * SLICE_INT: 0x20000000 Task is a SCHED_INTERACTIVE task partner. + * SLICE_SPA: 0x1FFE0000 Spare bits. + * SLICE_LTS: 0x0001FF80 Last time slice + * SLICE_AVG: 0x0000007F Task slice_avg stored as percentage. 
+ */ +#define SLICE_AVG_BITS 7 +#define SLICE_LTS_BITS 10 +#define SLICE_SPA_BITS 12 +#define SLICE_INT_BITS 1 +#define SLICE_NEW_BITS 1 +#define SLICE_FTS_BITS 1 + +#define SLICE_AVG_SHIFT 0 +#define SLICE_LTS_SHIFT (SLICE_AVG_SHIFT + SLICE_AVG_BITS) +#define SLICE_SPA_SHIFT (SLICE_LTS_SHIFT + SLICE_LTS_BITS) +#define SLICE_INT_SHIFT (SLICE_SPA_SHIFT + SLICE_SPA_BITS) +#define SLICE_NEW_SHIFT (SLICE_INT_SHIFT + SLICE_INT_BITS) +#define SLICE_FTS_SHIFT (SLICE_NEW_SHIFT + SLICE_NEW_BITS) + +#define INFO_MASK(x) ((1U << (x))-1) +#define SLICE_AVG_MASK (INFO_MASK(SLICE_AVG_BITS) << SLICE_AVG_SHIFT) +#define SLICE_LTS_MASK (INFO_MASK(SLICE_LTS_BITS) << SLICE_LTS_SHIFT) +#define SLICE_SPA_MASK (INFO_MASK(SLICE_SPA_BITS) << SLICE_SPA_SHIFT) +#define SLICE_INT_MASK (INFO_MASK(SLICE_INT_BITS) << SLICE_INT_SHIFT) +#define SLICE_NEW_MASK (INFO_MASK(SLICE_NEW_BITS) << SLICE_NEW_SHIFT) +#define SLICE_FTS_MASK (INFO_MASK(SLICE_FTS_BITS) << SLICE_FTS_SHIFT) + +/* p->slice_info access macros. */ +#define first_time_slice(p) ((p)->slice_info & SLICE_FTS_MASK) +#define set_first_time_slice(p) ((p)->slice_info |= SLICE_FTS_MASK) +#define clr_first_time_slice(p) ((p)->slice_info &= ~SLICE_FTS_MASK) + +#define slice_is_new(p) ((p)->slice_info & SLICE_NEW_MASK) +#define set_slice_is_new(p) ((p)->slice_info |= SLICE_NEW_MASK) +#define clr_slice_is_new(p) ((p)->slice_info &= ~SLICE_NEW_MASK) + +#define task_is_interactive(p) ((p)->slice_info & SLICE_INT_MASK) +#define set_task_is_interactive(p) ((p)->slice_info |= SLICE_INT_MASK) +#define clr_task_is_interactive(p) ((p)->slice_info &= ~SLICE_INT_MASK) + +#define last_slice(p) (((p)->slice_info & SLICE_LTS_MASK) >> SLICE_LTS_SHIFT) +#define set_last_slice(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_LTS_MASK) | (((n) << SLICE_LTS_SHIFT) & SLICE_LTS_MASK))) + +#define NS_SLEEP_AVG_PCNT (NS_MAX_SLEEP_AVG / 100) + +/* Note: raw storage format of slice_avg is %cpu. 
*/ +#define slice_avg(p) ((typeof((p)->sleep_avg)) \ + ((((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) * \ + NS_SLEEP_AVG_PCNT)) +#define set_slice_avg(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_AVG_MASK) | ((((n) / NS_SLEEP_AVG_PCNT) \ + << SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) +#define slice_avg_raw(p) \ + (((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) +#define set_slice_avg_raw(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_AVG_MASK) | (((n) << SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) + +/* cpu usage macros. */ +#define cpu_avg(p) \ + (100 - slice_avg_raw(p)) + +#define cpu_max(p) \ + (100 - ((p)->sleep_avg / NS_SLEEP_AVG_PCNT)) + +#define time_this_slice(p) \ + (jiffies - (p)->last_slice) + +#define cpu_this_slice(p) \ + (100 * last_slice(p) / max((unsigned) time_this_slice(p), \ + (unsigned) last_slice(p))) + +#define cpu_avg_rq(rq) \ + (100 * DEF_TIMESLICE / max((unsigned) (rq)->slice_avg, \ + (unsigned) DEF_TIMESLICE)) + +/* Positively identified interactive tasks. 
*/ +#define task_interactive(p) \ + ((p)->policy == SCHED_INTERACTIVE || task_is_interactive(p)) + #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) @@ -201,6 +344,7 @@ static inline unsigned int task_timeslic struct prio_array { unsigned int nr_active; + int interactive_ticks; DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ struct list_head queue[MAX_PRIO]; }; @@ -234,7 +378,8 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; + unsigned long switch_timestamp; + unsigned long slice_avg; /* Cached timestamp set by update_cpu_clock() */ unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; @@ -691,6 +836,8 @@ static void dequeue_task(struct task_str list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + if (TASK_INTERACTIVE(p)) + array->interactive_ticks -= p->time_slice; } static void enqueue_task(struct task_struct *p, struct prio_array *array) @@ -700,6 +847,8 @@ static void enqueue_task(struct task_str __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + if (TASK_INTERACTIVE(p)) + array->interactive_ticks += p->time_slice; } /* @@ -882,7 +1031,11 @@ static int recalc_task_prio(struct task_ /* Caller must always ensure 'now >= p->timestamp' */ unsigned long sleep_time = now - p->timestamp; - if (batch_task(p)) + /* + * Migration timestamp adjustment may induce negative time. + * Ignore unquantifiable values as well as SCHED_BATCH tasks. + */ + if (now < p->timestamp || batch_task(p)) sleep_time = 0; if (likely(sleep_time > 0)) { @@ -893,7 +1046,14 @@ static int recalc_task_prio(struct task_ */ unsigned long ceiling = INTERACTIVE_SLEEP(p); - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Update throttle position. 
+ */ + p->throttle += NS64_TO_JIFFIES(sleep_time); + if (time_before(jiffies, p->throttle)) + p->throttle = jiffies; + + if (sleep_time > ceiling && p->sleep_avg < ceiling) { /* * Prevents user tasks from achieving best priority * with one single large enough sleep. @@ -915,7 +1075,7 @@ static int recalc_task_prio(struct task_ * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { + if (p->sleep_type == SLEEP_NONINTERACTIVE) { if (p->sleep_avg >= ceiling) sleep_time = 0; else if (p->sleep_avg + sleep_time >= @@ -1531,16 +1691,23 @@ out_activate: * sleep_avg beyond just interactive state. */ p->sleep_type = SLEEP_NONINTERACTIVE; - } else + } else if (task_interactive(current)) { + /* + * Tasks tagged as being truly interactive + * pass temporary interactive status on to + * the task they are waking. + */ + set_task_is_interactive(p); + p->sleep_type = SLEEP_INTERACTIVE; + } /* * Tasks that have marked their sleep as noninteractive get * woken up with their sleep average not weighted in an * interactive way. */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - + else if (old_state & TASK_NONINTERACTIVE) + p->sleep_type = SLEEP_NONINTERACTIVE; activate_task(p, rq, cpu == this_cpu); /* @@ -1628,9 +1795,24 @@ void fastcall sched_fork(struct task_str * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ - p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); + + /* + * Set up slice_info and initial throttle position for the child. + */ + set_slice_avg(p, p->sleep_avg); + set_last_slice(p, p->time_slice); + set_slice_is_new(p); + set_first_time_slice(p); + p->last_slice = jiffies; + p->throttle = jiffies - C2 + C1; + /* + * SCHED_INTERACTIVE policy cannot be inherited. 
+ */ + if (unlikely(current->policy == SCHED_INTERACTIVE)) + p->policy = SCHED_NORMAL; + if (unlikely(!current->time_slice)) { /* * This case is rare, it happens when the parent has only @@ -1745,7 +1927,7 @@ void fastcall sched_exit(struct task_str * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + if (first_time_slice(p) && task_cpu(p) == task_cpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -3051,9 +3233,10 @@ static inline int expired_starving(struc { if (rq->curr->static_prio > rq->best_expired_prio) return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) + if (!STARVATION_LIMIT) return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) + if (jiffies - rq->switch_timestamp > rq->nr_running * DEF_TIMESLICE + + STARVATION_LIMIT) return 1; return 0; } @@ -3131,8 +3314,165 @@ void account_steal_time(struct task_stru cpustat->steal = cputime64_add(cpustat->steal, tmp); } +/* + * Promote and requeue the next lower priority task. If no task + * is available in the active array, switch to the expired array. + * @rq: runqueue to search. + * @prio: priority at which to begin search. 
+ */ +static inline void promote_next_lower(struct rq *rq, int prio) +{ + struct prio_array *array = rq->active; + struct task_struct *p = NULL; + unsigned long long now = rq->most_recent_timestamp; + unsigned long *bitmap; + unsigned long starving = JIFFIES_TO_NS(rq->slice_avg); + int idx = prio + 1, found_noninteractive = 0; + int ticks = rq->active->interactive_ticks + rq->expired->interactive_ticks; + +repeat: + bitmap = array->bitmap; + idx = find_next_bit(bitmap, MAX_PRIO, idx); + if (idx < MAX_PRIO) { + struct list_head *queue = array->queue + idx; + + p = list_entry(queue->next, struct task_struct, run_list); + if (!TASK_INTERACTIVE(p)) + found_noninteractive = 1; + + /* Skip non-starved queues. */ + if (now < p->last_ran + starving) { + idx++; + p = NULL; + goto repeat; + } + } else if (!found_noninteractive && array == rq->active) { + /* Nobody home, check the expired array. */ + array = rq->expired; + idx = prio; + p = NULL; + goto repeat; + } + + /* Found one, requeue it. */ + if (p) { + dequeue_task(p, p->array); + if (array == rq->active) + p->prio--; + /* + * If we pulled a task from the expired array, correct + * expired array info. We can't afford a full search + * for best_expired_prio, but do the best we can. + */ + else { + idx = sched_find_first_bit(array->bitmap); + if (idx < MAX_PRIO) { + if (rq->best_expired_prio > idx) + rq->best_expired_prio = idx; + } else { + /* We emptied the array */ + rq->best_expired_prio = MAX_PRIO; + /* + * If we have excessive interactive load, + * do not inhibit forced array switching. + */ + if (ticks < INTERACTIVE_LIMIT) + rq->switch_timestamp = jiffies; + } + } + enqueue_task(p, rq->active); + } +} + +/* + * Refresh timeslice and associated slice information. + * @p: the process to refresh. 
+ */ +static void refresh_timeslice(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + unsigned long slice_time = jiffies - p->last_slice; + int idle, cpu, cpu_avg, slice = last_slice(p); + int w = MAX_BONUS, delta, bonus; + + if (unlikely(slice_time < slice)) + slice_time = slice; + + /* Update task's CPU usage. */ + cpu_avg = slice_avg_raw(p); + cpu = cpu_this_slice(p); + idle = 100 - cpu; + delta = max(cpu_avg, idle) - min(cpu_avg, idle); + w = 1 + (delta / w); + cpu_avg = (w * cpu_avg + idle) / (w + 1); + set_slice_avg_raw(p, cpu_avg); + + /* + * If we've hit the throttle timeout, we aren't draining enough + * sleep_avg to keep up with the task's cpu usage. Up the ante + * to bring the task back toward balance. + */ + if (credit_exhausted(p, C2) && p->sleep_avg > slice_avg(p)) { + unsigned long run_time = p->sleep_avg - slice_avg(p); + run_time /= w; + if (p->sleep_avg >= run_time) + p->sleep_avg -= run_time; + } + + /* + * Update throttle position and sanity check it. + */ + if (task_is_interactive(p)) + p->throttle += slice_time - slice; + else if (INTERACTIVE_LIMIT_EXCEEDED(rq) && + cpu_avg - cpu_avg_rq(rq) >= PCNT_PER_DYNPRIO) { + bonus = (cpu_avg - cpu_avg_rq(rq)) / PCNT_PER_DYNPRIO; + p->throttle -= slice_time * bonus; + } else if (cpu < cpu_max(p) + PCNT_PER_DYNPRIO) { + bonus = idle * PCNT_PER_DYNPRIO / 100; + p->throttle += (slice_time - slice) * bonus; + } else if (cpu >= cpu_max(p) + PCNT_PER_DYNPRIO) { + bonus = (cpu - cpu_max(p)) / PCNT_PER_DYNPRIO; + p->throttle -= slice_time * bonus; + } + + if (time_before(jiffies, p->throttle)) + p->throttle = jiffies; + else if (credit_exhausted(p, C3)) + p->throttle = jiffies - C3; + + /* Add our slice time to the runqueue average. 
*/ + if (slice_time < HZ || slice_time < rq->nr_running * DEF_TIMESLICE) { + rq->slice_avg <<= 4; + rq->slice_avg += slice_time; + rq->slice_avg >>= 4; + } + + /* + * Ensure that SCHED_INTERACTIVE tasks and their partners will + * always be classified correctly by TASK_INTERACTIVE(). Clear + * propagated interactive task status. Propagated status is + * inherited from the parent, but is good for only one slice. + */ + if (task_is_interactive(p) && p->sleep_avg < INTERACTIVE_SLEEP(p)) + p->sleep_avg = INTERACTIVE_SLEEP(p); + clr_task_is_interactive(p); + + /* Update dynamic priority and time slice. */ + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + set_last_slice(p, p->time_slice); + + /* And finally, stamp and flag the new slice. */ + clr_first_time_slice(p); + set_slice_is_new(p); + p->last_slice = jiffies; +} + static void task_running_tick(struct rq *rq, struct task_struct *p) { + int task_was_interactive; + if (p->array != rq->active) { /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); @@ -3152,8 +3492,7 @@ static void task_running_tick(struct rq * FIFO tasks have no timeslices. */ if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + refresh_timeslice(p); set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -3161,21 +3500,36 @@ static void task_running_tick(struct rq } goto out_unlock; } + + /* + * Tick off interactive task ticks from the active array. 
+ */ + task_was_interactive = TASK_INTERACTIVE(p); + if (task_was_interactive && --rq->active->interactive_ticks < 0) + rq->active->interactive_ticks = 0; + if (!--p->time_slice) { dequeue_task(p, rq->active); + refresh_timeslice(p); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + + if (!TASK_INTERACTIVE(p) || expired_starving(rq) || + credit_exhausted(p, C2)) { enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); + + /* + * Always look to see if any queue under you is starving, + * and requeue a task if that is the case. This prevents + * things like multiple tasks at any priority waking in + * streams and starving their less fortunate peers via + * preempt, ie ensures that the less fortunate will have + * bounded latency. + */ + promote_next_lower(rq, p->prio); } else { /* * Prevent a too long timeslice allowing a task to monopolize @@ -3285,7 +3639,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx, new_prio; + int cpu, idx, new_prio, throttle; long *switch_count; struct rq *rq; @@ -3332,9 +3686,13 @@ need_resched_nonpreemptible: /* * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); + * delay them losing their interactive status. If we have too many + * interactive ticks queued or this task is being throttled, switch + * behavior to linear decay. 
+ */ + throttle = INTERACTIVE_LIMIT_EXCEEDED(rq) || credit_exhausted(prev, C2); + if (!throttle) + run_time /= 1 + CURRENT_BONUS(prev); spin_lock_irq(&rq->lock); @@ -3356,7 +3714,7 @@ need_resched_nonpreemptible: idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; + rq->switch_timestamp = jiffies; goto switch_tasks; } } @@ -3370,7 +3728,8 @@ need_resched_nonpreemptible: rq->active = rq->expired; rq->expired = array; array = rq->active; - rq->expired_timestamp = 0; + array->interactive_ticks = 0; + rq->switch_timestamp = jiffies; rq->best_expired_prio = MAX_PRIO; } @@ -3380,6 +3739,8 @@ need_resched_nonpreemptible: if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; + int next_interactive = TASK_INTERACTIVE(next); + if (unlikely((long long)(now - next->timestamp) < 0)) delta = 0; @@ -3389,14 +3750,33 @@ need_resched_nonpreemptible: array = next->array; new_prio = recalc_task_prio(next, next->timestamp + delta); + /* + * If INTERACTIVE_LIMIT is exceeded, do not promote + * tasks which already have interactive status. This + * can only make things worse if the load isn't truly + * interactive, so let them decay. We also don't want + * a task which has been promoted while waiting to + * get CPU after wakeup to be demoted, and thus end + * up being preempted immediately by a task waking + * at the priority it has just reached. Tasks which + * miss the tick frequently also get caught here, so + * care has to be taken to not help them along. Since + * these are very likely to have interactive status, + * don't ever demote a non-interactive task here, and + * always considered interactive tasks to be fair game. 
+ */ + if ((throttle && next_interactive && new_prio < next->prio) || + (!next_interactive && new_prio > next->prio)) + goto switch_tasks; + if (unlikely(next->prio != new_prio)) { dequeue_task(next, array); next->prio = new_prio; enqueue_task(next, array); } } - next->sleep_type = SLEEP_NORMAL; switch_tasks: + next->sleep_type = SLEEP_NORMAL; if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); @@ -3411,6 +3791,14 @@ switch_tasks: prev->sleep_avg = 0; prev->timestamp = prev->last_ran = now; + /* + * Tag start of execution of a new timeslice. + */ + if (unlikely(slice_is_new(next))) { + next->last_slice = jiffies; + clr_slice_is_new(next); + } + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = next->last_ran = now; @@ -4081,7 +4469,8 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_INTERACTIVE) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -4619,6 +5008,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_INTERACTIVE: ret = 0; break; } @@ -4643,6 +5033,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_INTERACTIVE: ret = 0; } return ret; @@ -6772,6 +7163,7 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; + rq->slice_avg = STARVATION_LIMIT; #ifdef CONFIG_SMP rq->sd = NULL; --- linux-2.6.21-rc5-x/kernel/sysctl.c.org 2007-03-31 12:54:06.000000000 +0200 +++ linux-2.6.21-rc5-x/kernel/sysctl.c 2007-04-01 08:04:02.000000000 +0200 @@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int credit_c1; +extern int credit_c2; +extern int credit_max; /* 
this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -204,6 +207,13 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; +/* + * Constants for minimum and maximum testing in vm_table and + * kern_table. We use these as one-element integer vectors. +*/ +static int zero; +static int one_hundred = 100; + static ctl_table kern_table[] = { { .ctl_name = KERN_PANIC, @@ -603,16 +613,31 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = KERN_SCHED_THROTTLE1, + .procname = "credit_c1", + .data = &credit_c1, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &credit_max, + }, + { + .ctl_name = KERN_SCHED_THROTTLE2, + .procname = "credit_c2", + .data = &credit_c2, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &credit_max, + }, { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY, ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-05 11:02 ` Mike Galbraith @ 2007-04-05 11:09 ` Ingo Molnar 2007-04-05 11:12 ` Mike Galbraith 2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar 1 sibling, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-05 11:09 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list find a whitespace fix below. Ingo Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -1034,7 +1034,7 @@ static int recalc_task_prio(struct task_ /* * Migration timestamp adjustment may induce negative time. * Ignore unquantifiable values as well as SCHED_BATCH tasks. - */ + */ if (now < p->timestamp || batch_task(p)) sleep_time = 0; ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-05 11:09 ` Ingo Molnar @ 2007-04-05 11:12 ` Mike Galbraith 2007-04-05 11:15 ` Ingo Molnar 2007-04-05 13:18 ` Johannes Stezenbach 0 siblings, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 11:12 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Thu, 2007-04-05 at 13:09 +0200, Ingo Molnar wrote: > find a whitespace fix below. > > Ingo > > Index: linux/kernel/sched.c > =================================================================== > --- linux.orig/kernel/sched.c > +++ linux/kernel/sched.c > @@ -1034,7 +1034,7 @@ static int recalc_task_prio(struct task_ > /* > * Migration timestamp adjustment may induce negative time. > * Ignore unquantifiable values as well as SCHED_BATCH tasks. > - */ > + */ > if (now < p->timestamp || batch_task(p)) > sleep_time = 0; > Thanks. (dang, i need to find that fifty "make it red" thingie for vi again) -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-05 11:12 ` Mike Galbraith @ 2007-04-05 11:15 ` Ingo Molnar 2007-04-05 13:18 ` Johannes Stezenbach 1 sibling, 0 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-05 11:15 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list * Mike Galbraith <efault@gmx.de> wrote: > > - */ > > + */ > > if (now < p->timestamp || batch_task(p)) > > sleep_time = 0; > > > > Thanks. > > (dang, i need to find that fifty "make it red" thingie for vi again) or just start using quilt, which warns about this :) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-05 11:12 ` Mike Galbraith 2007-04-05 11:15 ` Ingo Molnar @ 2007-04-05 13:18 ` Johannes Stezenbach 2007-04-05 15:28 ` Mike Galbraith 1 sibling, 1 reply; 92+ messages in thread From: Johannes Stezenbach @ 2007-04-05 13:18 UTC (permalink / raw) To: Mike Galbraith Cc: Ingo Molnar, Con Kolivas, linux list, Andrew Morton, ck list On Thu, Apr 05, 2007, Mike Galbraith wrote: > > (dang, i need to find that fifty "make it red" thingie for vi again) put "let c_space_errors=1" in .vimrc HTH, Johannes ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-05 13:18 ` Johannes Stezenbach @ 2007-04-05 15:28 ` Mike Galbraith 0 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 15:28 UTC (permalink / raw) To: Johannes Stezenbach Cc: Ingo Molnar, Con Kolivas, linux list, Andrew Morton, ck list On Thu, 2007-04-05 at 15:18 +0200, Johannes Stezenbach wrote: > On Thu, Apr 05, 2007, Mike Galbraith wrote: > > > > (dang, i need to find that fifty "make it red" thingie for vi again) ^(spiffy;) > > put "let c_space_errors=1" in .vimrc Thanks. I received this link via private mail, and think it's worth posting. Who knows, it may save Maintainers an antacid tablet or two. http://www.pixelbeat.org/settings/.vimrc -Mike (may eventually get tired of the colors, but for now they're cooler than the plain black and white i'm used to, _and_ has "make it glow" feature) ^ permalink raw reply [flat|nested] 92+ messages in thread
* [test] sched: SD-latest versus Mike's latest 2007-04-05 11:02 ` Mike Galbraith 2007-04-05 11:09 ` Ingo Molnar @ 2007-04-05 11:54 ` Ingo Molnar 2007-04-05 12:10 ` Mike Galbraith ` (2 more replies) 1 sibling, 3 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-05 11:54 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list * Mike Galbraith <efault@gmx.de> wrote: > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > > > looks interesting - could you send the patch? > > Ok, this is looking/feeling pretty good in testing. Comments on > fugliness etc much appreciated. > > Below the numbers is a snapshot of my experimental tree. It's a > mixture of my old throttling/anti-starvation tree and the task > promotion patch, with the addition of a scheduling class for > interactive tasks to dish out some of that targeted unfairness I > mentioned. here's some test results, comparing SD-latest to Mike's-latest: re-testing the weak points of the vanilla scheduler + Mike's: - thud.c: this workload has almost unnoticeable effect - fiftyp.c: noticeable, but alot better than previously! re-testing the weak points of SD: - hackbench: still unusable under such type of high load - no improvement. - make -j: still less interactive than Mike's - no improvement. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar @ 2007-04-05 12:10 ` Mike Galbraith 2007-04-05 12:12 ` Ingo Molnar 2007-04-05 16:08 ` Con Kolivas 2007-04-06 1:03 ` Ten percent test Con Kolivas 2 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 12:10 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Thu, 2007-04-05 at 13:54 +0200, Ingo Molnar wrote: > here's some test results, comparing SD-latest to Mike's-latest: > > re-testing the weak points of the vanilla scheduler + Mike's: > > - thud.c: this workload has almost unnoticeable effect > - fiftyp.c: noticeable, but alot better than previously! Hmm. Here fiftyp.c is utterly harmless. If you have a second, can you send me a top snapshot? If you're running many of them, it can take a bit for the throttle to catch them all. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 12:10 ` Mike Galbraith @ 2007-04-05 12:12 ` Ingo Molnar 2007-04-05 12:24 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-05 12:12 UTC (permalink / raw) To: Mike Galbraith; +Cc: Con Kolivas, linux list, Andrew Morton, ck list * Mike Galbraith <efault@gmx.de> wrote: > > re-testing the weak points of the vanilla scheduler + Mike's: > > > > - thud.c: this workload has almost unnoticeable effect > > - fiftyp.c: noticeable, but alot better than previously! > > Hmm. Here fiftyp.c is utterly harmless. If you have a second, can > you send me a top snapshot? If you're running many of them, it can > take a bit for the throttle to catch them all. ah, indeed - i ran 10 of them and letting them run for a bit smoothes things out. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 12:12 ` Ingo Molnar @ 2007-04-05 12:24 ` Mike Galbraith 0 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 12:24 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Thu, 2007-04-05 at 14:12 +0200, Ingo Molnar wrote: > * Mike Galbraith <efault@gmx.de> wrote: > > > > re-testing the weak points of the vanilla scheduler + Mike's: > > > > > > - thud.c: this workload has almost unnoticeable effect > > > - fiftyp.c: noticeable, but alot better than previously! > > > > Hmm. Here fiftyp.c is utterly harmless. If you have a second, can > > you send me a top snapshot? If you're running many of them, it can > > take a bit for the throttle to catch them all. > > ah, indeed - i ran 10 of them and letting them run for a bit smoothes > things out. Ok, I didn't try 10 of them. It can still get a bit ragged here, so I may have to latch the throttle for a bit to make sure they have to maintain improved behavior to get unleashed. 5 of them get instantly nailed, and stay nailed. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar 2007-04-05 12:10 ` Mike Galbraith @ 2007-04-05 16:08 ` Con Kolivas 2007-04-05 19:05 ` Ingo Molnar 2007-04-05 20:29 ` Mike Galbraith 2007-04-06 1:03 ` Ten percent test Con Kolivas 2 siblings, 2 replies; 92+ messages in thread From: Con Kolivas @ 2007-04-05 16:08 UTC (permalink / raw) To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > * Mike Galbraith <efault@gmx.de> wrote: > > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > > > looks interesting - could you send the patch? > > > > Ok, this is looking/feeling pretty good in testing. Comments on > > fugliness etc much appreciated. > > > > Below the numbers is a snapshot of my experimental tree. It's a > > mixture of my old throttling/anti-starvation tree and the task Throttling to try to get to SD fairness? The mainline state machine becomes more complex than ever and fluctuates from interactive to fair by an as-yet unchosen magic number timeframe which ebbs and flows. > > promotion patch, with the addition of a scheduling class for > > interactive tasks to dish out some of that targeted unfairness I > > mentioned. Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New scheduling class just for X? Sounds like a very complicated userspace-changing way to just do the equivalent of "nice -n -10" obfuscated. > here's some test results, comparing SD-latest to Mike's-latest: > > re-testing the weak points of the vanilla scheduler + Mike's: > > - thud.c: this workload has almost unnoticeable effect > - fiftyp.c: noticeable, but alot better than previously! Load of 1.5 makes mainline a doorstop without throttling. > re-testing the weak points of SD: > > - hackbench: still unusable under such type of high load - no improvement. Load of 160. Is proportional slowdown bad? 
> - make -j: still less interactive than Mike's - no improvement. Depends on how big your job number vs cpu is. The better the throttling gets with mainline the better SD gets in this comparison. At equal fairness mainline does not have the low latency interactivity SD has. Nice -10 X with SD is a far better solution than an ever increasing complexity state machine and a userspace-changing scheduling policy just for X. Half decent graphics cards get good interactivity with SD even without renicing. > Ingo -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 16:08 ` Con Kolivas @ 2007-04-05 19:05 ` Ingo Molnar 2007-04-05 20:29 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-05 19:05 UTC (permalink / raw) To: Con Kolivas; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list * Con Kolivas <kernel@kolivas.org> wrote: > Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New > scheduling class just for X? Sounds like a very complicated > userspace-changing way to just do the equivalent of "nice -n -10" > obfuscated. i think you are missing the point. We _do not know in advance_ whether X should be prioritized or not. It's the behavior of X that determines it. When X is reniced to -10 it fixes a few corner cases, but it breaks many other cases. We found that out time and time again. btw., the tests i've done were not with X but using a shell prompt. > > re-testing the weak points of SD: > > > > - hackbench: still unusable under such type of high load - no > > improvement. > > Load of 160. Is proportional slowdown bad? this is relative to how mainline+Mike's handles it. Users wont really care about the why's, they'll only see the slowdown. > > - make -j: still less interactive than Mike's - no improvement. > > Depends on how big your job number vs cpu is. The better the > throttling gets with mainline the better SD gets in this comparison. > At equal fairness mainline does not have the low latency interactivity > SD has. i often run make jobs with -j200 or larger, and SD gets worse than even mainline much sooner than that. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [test] sched: SD-latest versus Mike's latest 2007-04-05 16:08 ` Con Kolivas 2007-04-05 19:05 ` Ingo Molnar @ 2007-04-05 20:29 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-05 20:29 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Fri, 2007-04-06 at 02:08 +1000, Con Kolivas wrote: > On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > > * Mike Galbraith <efault@gmx.de> wrote: > > > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > > > > looks interesting - could you send the patch? > > > > > > Ok, this is looking/feeling pretty good in testing. Comments on > > > fugliness etc much appreciated. > > > > > > Below the numbers is a snapshot of my experimental tree. It's a > > > mixture of my old throttling/anti-starvation tree and the task > > Throttling to try to get to SD fairness? The mainline state machine becomes > more complex than ever and fluctuates from interactive to fair by an as-yet > unchosen magic number timeframe which ebbs and flows. I believe I've already met and surpassed SD fairness. Bold statement, but I believe it's true. I'm more worried about becoming _too_ fair. Show me your numbers. I showed you mine with both SD and my patches. WRT magic and state machine complexity: If you read the patch, there is nothing "magical" about it. It doesn't do anything but monitor CPU usage and move a marker. It does nothing the least bit complicated, and what it does, it does in the slow path. The only thing it does in the fast path is to move the marker, and perhaps tag a targeted task. State machine? There is nothing there that resembles a state machine to me, the heuristic is just add sleep time, burn on use. > > > promotion patch, with the addition of a scheduling class for > > > interactive tasks to dish out some of that targeted unfairness I > > > mentioned. > > Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. 
New > scheduling class just for X? Sounds like a very complicated > userspace-changing way to just do the equivalent of "nice -n -10" obfuscated. This patch makes massive nice -10 vs nice 0 latency history I believe. Testing welcome. WRT "nice -10 obfuscated", that's a load of high grade horse-hockey. There were very good reasons posted here as to why that is a very bad idea, perhaps you haven't read them. (you can find them if you choose) Your criticism of SCHED_INTERACTIVE leaves me dumbfounded, since you were, and still are, specifically telling me that I should tell the scheduler that X is special. I did precisely that, and am also trying to tell it that its clients are special too, _without_ having to start each and every client at nice -10 or whatever static number of the day. > > here's some test results, comparing SD-latest to Mike's-latest: > > > > re-testing the weak points of the vanilla scheduler + Mike's: > > > > - thud.c: this workload has almost unnoticeable effect > > - fiftyp.c: noticeable, but alot better than previously! > > Load of 1.5 makes mainline a doorstop without throttling. Where does that come from? Doesn't jibe with my experience at all. > > re-testing the weak points of SD: > > > > - hackbench: still unusable under such type of high load - no improvement. > > Load of 160. Is proportional slowdown bad? > > > - make -j: still less interactive than Mike's - no improvement. > > Depends on how big your job number vs cpu is. The better the throttling gets > with mainline the better SD gets in this comparison. At equal fairness > mainline does not have the low latency interactivity SD has. So we should do 8ms slices too? I don't think that's necessary. > Nice -10 X with SD is a far better solution than an ever increasing complexity > state machine and a userspace-changing scheduling policy just for X. Half > decent graphics cards get good interactivity with SD even without renicing. 
SD does not retain interactivity under any appreciable load for one, and secondly, I'm getting interactivity that SD cannot even get close to without renicing, and without any patches - in mainline right now. (Speaking of low latency, how long can tasks forking off sleepers who overlap their wake times prevent an array switch with SD? Forever?) I posted numbers that demonstrate the improvement in fairness while maintaining interactivity, and I'm not finished. I've solved the multiple fiftyp.c thing Ingo noticed, and in fact, I had 10 copies running that I had forgotten to terminate while I was working, and I didn't even notice until I finished, and saw my top window. Patch to follow as soon as I test some more (that's what takes much time, not creating the diff. this isn't rocket science.) Maybe I'll succeed, maybe I won't. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Ten percent test 2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar 2007-04-05 12:10 ` Mike Galbraith 2007-04-05 16:08 ` Con Kolivas @ 2007-04-06 1:03 ` Con Kolivas 2007-04-06 9:07 ` Mike Galbraith 2 siblings, 1 reply; 92+ messages in thread From: Con Kolivas @ 2007-04-06 1:03 UTC (permalink / raw) To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list [-- Attachment #1: Type: text/plain, Size: 599 bytes --] On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > - fiftyp.c: noticeable, but alot better than previously! fiftyp.c seems to have been stumbled across by accident as having an effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I suggest a ten percent version like the following would be more useful as a test for the harmful effect discovered in fiftyp.c. (/me throws in obligatory code style change). Starts 15 processes that sleep ten times longer than they run. Change forks to 15 times the number of cpus you have and it should work on any size hardware. -- -ck [-- Attachment #2: tenp.c --] [-- Type: text/x-csrc, Size: 3784 bytes --] // gcc -O2 -o tenp tenp.c -lrt // code from interbench.c #include <stdio.h> #include <stdlib.h> #include <time.h> #include <unistd.h> #include <errno.h> #include <sys/types.h> /* * Start $forks processes that run for 10% cpu time each. Set this to * 15 * number of cpus for best effect. */ int forks = 15; unsigned long run_us = 1000000000, sleep_us; unsigned long loops_per_ms; void terminal_error(const char *name) { fprintf(stderr, "\n"); perror(name); exit (1); } unsigned long long get_nsecs(struct timespec *myts) { if (clock_gettime(CLOCK_REALTIME, myts)) terminal_error("clock_gettime"); return (myts->tv_sec * 1000000000 + myts->tv_nsec ); } void burn_loops(unsigned long loops) { unsigned long i; /* * We need some magic here to prevent the compiler from optimising * this loop away. Otherwise trying to emulate a fixed cpu load * with this loop will not work. 
*/ for (i = 0 ; i < loops ; i++) asm volatile("" : : : "memory"); } /* Use this many usecs of cpu time */ void burn_usecs(unsigned long usecs) { unsigned long ms_loops; ms_loops = loops_per_ms / 1000 * usecs; burn_loops(ms_loops); } void microsleep(unsigned long long usecs) { struct timespec req, rem; rem.tv_sec = rem.tv_nsec = 0; req.tv_sec = usecs / 1000000; req.tv_nsec = (usecs - (req.tv_sec * 1000000)) * 1000; continue_sleep: if ((nanosleep(&req, &rem)) == -1) { if (errno == EINTR) { if (rem.tv_sec || rem.tv_nsec) { req.tv_sec = rem.tv_sec; req.tv_nsec = rem.tv_nsec; goto continue_sleep; } goto out; } terminal_error("nanosleep"); } out: return; } /* * In an unoptimised loop we try to benchmark how many meaningless loops * per second we can perform on this hardware to fairly accurately * reproduce certain percentage cpu usage */ void calibrate_loop(void) { unsigned long long start_time, loops_per_msec, run_time = 0, min_run_us = run_us; unsigned long loops; struct timespec myts; int i; printf("Calibrating loop\n"); loops_per_msec = 1000000; redo: /* Calibrate to within 1% accuracy */ while (run_time > 1010000 || run_time < 990000) { loops = loops_per_msec; start_time = get_nsecs(&myts); burn_loops(loops); run_time = get_nsecs(&myts) - start_time; loops_per_msec = (1000000 * loops_per_msec / run_time ? 
: loops_per_msec); } /* Rechecking after a pause increases reproducibility */ microsleep(1); loops = loops_per_msec; start_time = get_nsecs(&myts); burn_loops(loops); run_time = get_nsecs(&myts) - start_time; /* Tolerate 5% difference on checking */ if (run_time > 1050000 || run_time < 950000) goto redo; loops_per_ms=loops_per_msec; printf("Calibrating sleep interval\n"); microsleep(1); /* Find the smallest time interval close to 1ms that we can sleep */ for (i = 0; i < 100; i++) { start_time=get_nsecs(&myts); microsleep(1000); run_time=get_nsecs(&myts)-start_time; run_time /= 1000; if (run_time < run_us && run_us > 1000) run_us = run_time; } /* Then set run_us to that duration and sleep_us to 9 x that */ sleep_us = run_us * 9; printf("Calibrating run interval\n"); microsleep(1); /* Do a few runs to see what really gets us run_us runtime */ for (i = 0; i < 100; i++) { start_time=get_nsecs(&myts); burn_usecs(run_us); run_time=get_nsecs(&myts)-start_time; run_time /= 1000; if (run_time < min_run_us && run_time > run_us) min_run_us = run_time; } if (min_run_us < run_us) run_us = run_us * run_us / min_run_us; printf("Each fork will run for %lu usecs and sleep for %lu usecs\n", run_us, sleep_us); } int main(void){ int i; calibrate_loop(); printf("starting %d forks\n", forks); for(i = 1; i < forks; i++){ if(!fork()) break; } while(1){ burn_usecs(run_us); microsleep(sleep_us); } return 0; } ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 1:03 ` Ten percent test Con Kolivas @ 2007-04-06 9:07 ` Mike Galbraith 2007-04-06 9:28 ` Con Kolivas 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-06 9:07 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote: > On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > > - fiftyp.c: noticeable, but alot better than previously! > > fiftyp.c seems to have been stumbled across by accident as having an effect > when Xenofon was trying to recreate Mike's 50% x 3 test case. I suggest a ten > percent version like the following would be more useful as a test for the > harmful effect discovered in fiftyp.c. (/me throws in obligatory code style > change). > > Starts 15 processes that sleep ten times longer than they run. Change forks to > 15 times the number of cpus you have and it should work on any size hardware. I was more focused on the general case, but all I should have to do to de-claw all of these sleep exploits is account rr time (only a couple of lines, done and building now). It's only a couple of lines. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 9:07 ` Mike Galbraith @ 2007-04-06 9:28 ` Con Kolivas 2007-04-06 10:03 ` Ingo Molnar 2007-04-06 10:48 ` Mike Galbraith 0 siblings, 2 replies; 92+ messages in thread From: Con Kolivas @ 2007-04-06 9:28 UTC (permalink / raw) To: Mike Galbraith; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Friday 06 April 2007 19:07, Mike Galbraith wrote: > On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote: > > On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > > > - fiftyp.c: noticeable, but alot better than previously! > > > > fiftyp.c seems to have been stumbled across by accident as having an > > effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I > > suggest a ten percent version like the following would be more useful as > > a test for the harmful effect discovered in fiftyp.c. (/me throws in > > obligatory code style change). > > > > Starts 15 processes that sleep ten times longer than they run. Change > > forks to 15 times the number of cpus you have and it should work on any > > size hardware. > > I was more focused on the general case, but all I should have to do to > de-claw all of these sleep exploits is account rr time (only a couple of > lines, done and building now). It's only a couple of lines. The more you try to "de-claw" these sleep exploits the less effective you make your precious interactive estimator. Feel free to keep adding endless tweaks to undo the other tweaks in order to try and achieve what SD has by design. You'll end up with an increasingly complex state machine design of interactivity tweaks and interactivity throttlers all fighting each other to the point where the interactivity estimator doesn't do anything. What's the point in that? Eventually you'll have an estimator throttled to the point it does nothing and you end up with something far less interactive than SD which is as interactive as fairness allows, unlike mainline. 
-- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 9:28 ` Con Kolivas @ 2007-04-06 10:03 ` Ingo Molnar 2007-04-06 10:40 ` Mike Galbraith 2007-04-07 6:50 ` Con Kolivas 2007-04-06 10:48 ` Mike Galbraith 1 sibling, 2 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-06 10:03 UTC (permalink / raw) To: Con Kolivas; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list * Con Kolivas <kernel@kolivas.org> wrote: > > I was more focused on the general case, but all I should have to do > > to de-claw all of these sleep exploits is account rr time (only a > > couple of lines, done and building now). It's only a couple of > > lines. > > The more you try to "de-claw" these sleep exploits the less effective > you make your precious interactive estimator. Feel free to keep adding > endless tweaks to undo the other tweaks in order to try and achieve > what SD has by design. firstly, testing on various workloads Mike's tweaks work pretty well, while SD still doesnt handle the high-load case all that well. Note that it was you who raised this whole issue to begin with: everything was pretty quiet in scheduling interactivity land. (There was one person who reported wide-scale interactivity regressions against mainline but he didnt answer my followup posts to trace/debug the scenario.) SD has a built-in "interactivity estimator" as well, but hardcoded into its design. SD has its own set of ugly-looking tweaks as well - for example the prio_matrix. So it all comes down on 'what interactivity heuristics is enough', and which one is more tweakable. So far i've yet to see SD address the hackbench and make -j interactivity problems/regression for example, while Mike has been busy addressing the 'exploits' reported against mainline. > You'll end up with an incresingly complex state machine design of > interactivity tweaks and interactivity throttlers all fighting each > other to the point where the intearactivity estimator doesn't do > anything. [...] 
It comes down to defining interactivity by scheduling behavior, and making that definition flexible. SD's definition of interactivity is rigid (but it's still behavior-based, so not fundamentally different from an explicit 'interactivity estimator'), and currently it does not work well under high load. But ... i'm still entertaining the notion that it might be good enough, but you've got to demonstrate the design's flexibility. furthermore, your description does not match my experience when using Mike's tweaks and comparing it to SD on the same hardware. According to your claim i should have seen regressions popping up in various, already-fixed corners, but it didnt happen in practice. But ... i'm awaiting further SD and Mike tweaks, the race certainly looks interesting ;) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 10:03 ` Ingo Molnar @ 2007-04-06 10:40 ` Mike Galbraith 2007-04-07 6:50 ` Con Kolivas 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-06 10:40 UTC (permalink / raw) To: Ingo Molnar; +Cc: Con Kolivas, linux list, Andrew Morton, ck list On Fri, 2007-04-06 at 12:03 +0200, Ingo Molnar wrote: > already-fixed corners, but it didnt happen in practice. But ... i'm > awaiting further SD and Mike tweaks, the race certainly looks > interesting ;) <g> I think I lapped him, but since we're running in opposite directions, it's hard to tell. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 10:03 ` Ingo Molnar 2007-04-06 10:40 ` Mike Galbraith @ 2007-04-07 6:50 ` Con Kolivas 2007-04-07 16:12 ` Gene Heskett ` (2 more replies) 1 sibling, 3 replies; 92+ messages in thread From: Con Kolivas @ 2007-04-07 6:50 UTC (permalink / raw) To: Ingo Molnar; +Cc: Mike Galbraith, linux list, Andrew Morton, ck list On Friday 06 April 2007 20:03, Ingo Molnar wrote: > * Con Kolivas <kernel@kolivas.org> wrote: > > > I was more focused on the general case, but all I should have to do > > > to de-claw all of these sleep exploits is account rr time (only a > > > couple of lines, done and building now). It's only a couple of > > > lines. > > > > The more you try to "de-claw" these sleep exploits the less effective > > you make your precious interactive estimator. Feel free to keep adding > > endless tweaks to undo the other tweaks in order to try and achieve > > what SD has by design. > > firstly, testing on various workloads Mike's tweaks work pretty well, > while SD still doesnt handle the high-load case all that well. Note that > it was you who raised this whole issue to begin with: everything was > pretty quiet in scheduling interactivity land. I'm terribly sorry but you have completely missed my intentions then. I was _not_ trying to improve mainline's interactivity at all. My desire was to fix the unfairness that mainline has, across the board without compromising fairness. You said yourself that an approach that fixed a lot and had a small number of regressions would be worth it. In a surprisingly ironic turnaround two bizarre things happened. People found SD fixed a lot of their interactivity corner cases which were showstoppers. That didn't surprise me because any unfair design will by its nature get it wrong sometimes. The even _more_ surprising thing is that you're now using interactivity as the argument against SD. 
I did not set out to create better interactivity, I set out to create widespread fairness without too much compromise to interactivity. As I said from the _very first email_, there would be cases of interactivity in mainline that performed better. > (There was one person who > reported wide-scale interactivity regressions against mainline but he > didnt answer my followup posts to trace/debug the scenario.) That was one user. As I mentioned in an earlier thread, the problem with email threads on drawn out issues on lkml is that all that people remember is the last one creating noise, and that has only been the noise from Mike for 2 weeks now. Has everyone forgotten the many many users who reported the advantages first up which generated the interest in the first place? Why have they stopped reporting? Well the answer is obvious; all the signs suggest that SD is slated for mainline. It is on the path, Linus has suggested it and now akpm is asking if it's ready for 2.6.22. So they figure there is no point testing and replying any further. SD is ready for prime time, finalised and does everything I intended it to. This is where I have to reveal to them the horrible truth. This is no guarantee it will go in. In fact, this one point that you (Ingo) go on and on about is not only a quibble, but you will call it an absolute showstopper. As maintainer of the cpu scheduler, in its current form you will flatly refuse it goes to mainline citing the 5% of cases where interactivity has regressed. So people will tell me to fix it, right?... Read on for this to unfold. > SD has a built-in "interactivity estimator" as well, but hardcoded into > its design. SD has its own set of ugly-looking tweaks as well - for > example the prio_matrix. I'm sorry but this is a mis-representation to me, as I suggested on an earlier thread where I disagree about what an interactivity estimator is. 
The idea of fence posts in a clock that are passed as a way of metering out earliest-deadline-first in a design is well established. The matrix is simply an array designed for O(1) lookups of the fence posts. That is not the same as "oh how much have we slept in the last $magic_number period and how much extra time should we get for that". > So it all comes down on 'what interactivity > heuristics is enough', and which one is more tweakable. So far i've yet > to see SD address the hackbench and make -j interactivity > problems/regression for example, while Mike has been busy addressing the > 'exploits' reported against mainline. And BANG there is the bullet you will use against SD from here to eternity. SD obeys fairness at all costs. Your interactivity regression is that SD causes progressive slowdown with load which by definition is fairness. You repeatedly ask me to address it and there is one unfailing truth; the only way to address it is to add unfairness to the design. So why don't I? Because the simple fact is that any unfairness no matter how carefully administered or metered will always have cases where it's wrong. Look at the title of this email for example - it's yet another exploit for the mainline sleep/run mechanism. This does _not_ mean I'm implying people are logging into servers and running ./tenp to hang the machine. What it demonstrates is a way of reproducing the scenario which is biting people with real world loads. It's entirely believable that a simple p2p app could be behaving like tenp, only generating a small load and it could take ages to log in and use the console. Willy has complained this is why people stick to 2.4. Sure I can create interactivity tweaks worse than anyone else. 
> > You'll end up with an incresingly complex state machine design of > > interactivity tweaks and interactivity throttlers all fighting each > > other to the point where the intearactivity estimator doesn't do > > anything. [...] > > It comes down to defining interactivity by scheduling behavior, and > making that definition flexible. SD's definition of interactivity is > rigid (but it's still behavior-based, so not fundamentally different > from an explicit 'interactivity estimator'), and currently it does not > work well under high load. But ... i'm still entertaining the notion > that it might be good enough, but you've got to demonstrate the design's > flexibility. I have yet to see someone find an "exploit" for SD's current design. Mainline is all about continually patching up the intrinsic design (and fixing this one test case is not the be all and end all). > furthermore, your description does not match my experience when using > Mike's tweaks and comparing it to SD on the same hardware. According to > your claim i should have seen regressions popping up in various, > already-fixed corners, but it didnt happen in practice. But ... i'm > awaiting further SD and Mike tweaks, the race certainly looks > interesting ;) Well you see a race. I do not. I see a flat predictable performance from SD where there will always be slowdown with load. I have no intention of changing that. Mike is making an admirable attempt to fix issues as they are pointed out. You say there are no regressions but I see absolutely no testers of his patches besides himself and you. If I introduce any unfairness based on sleep behaviour into SD I'll be undoing the whole point of the design and end up chasing new regressions. So I won't quibble over the numbers. SD has produced a lot of improvements and fairness that mainline struggles with ever increasing patches to emulate, but SD does so at the expense of proportional slowdown with load. 
At least I accept that and will no longer put my health at risk trying to "fix" it by "breaking" it. SD is done. I feel sorry for the many users out there who are simply "waiting for it to end up in mainline" who just discovered you will veto it on that basis. lwn.net had it wrong; this was far more painful than any previous attempt to get anything into mainline. My health has been so badly affected by this that I've been given an ultimatum and must turn my computer off till I get well now which may be weeks. I already know the massive flameage and last-word comments that are likely to be fired off before the inevitable decision to veto it. > Ingo さようなら -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 6:50 ` Con Kolivas @ 2007-04-07 16:12 ` Gene Heskett 2007-04-07 18:08 ` Ingo Molnar 2007-04-07 16:32 ` Mike Galbraith 2007-04-08 13:08 ` Ed Tomlinson 2 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-07 16:12 UTC (permalink / raw) To: linux-kernel Cc: Con Kolivas, Ingo Molnar, Mike Galbraith, Andrew Morton, ck list On Saturday 07 April 2007, Con Kolivas wrote: >On Friday 06 April 2007 20:03, Ingo Molnar wrote: >> * Con Kolivas <kernel@kolivas.org> wrote: >[...] >> >> firstly, testing on various workloads Mike's tweaks work pretty well, >> while SD still doesnt handle the high-load case all that well. Note >> that it was you who raised this whole issue to begin with: everything >> was pretty quiet in scheduling interactivity land. Con was scratching an itch, one we desktop users all have in a place we can't quite reach to scratch because we aren't quite the coding gods we should be. Con at least has the coding knowledge to walk in and start shoveling, which is more than I can say of the efforts to derail the SD scheduler have demonstrated to this user. >I'm terribly sorry but you have completely missed my intentions then. I > was _not_ trying to improve mainline's interactivity at all. My desire > was to fix the unfairness that mainline has, across the board without > compromising fairness. You said yourself that an approach that fixed a > lot and had a small number of regressions would be worth it. In a > surprisingly ironic turnaround two bizarre things happened. People > found SD fixed a lot of their interactivity corner cases which were > showstoppers. That didn't surprise me because any unfair design will by > its nature get it wrong sometimes. The even _more_ surprising thing is > that you're now using interactivity as the argument against SD. I did > not set out to create better interactivity, I set out to create > widespread fairness without too much compromise to interactivity. 
As I > said from the _very first email_, there would be cases of interactivity > in mainline that performed better. > >> (There was one person who >> reported wide-scale interactivity regressions against mainline but he >> didnt answer my followup posts to trace/debug the scenario.) > >That was one user. As I mentioned in an earlier thread, the problem with > email threads on drawn out issues on lkml is that all that people > remember is the last one creating noise, and that has only been the > noise from Mike for 2 weeks now. Has everyone forgotten the many many > users who reported the advantages first up which generated the interest > in the first place? Why have they stopped reporting? Well the answer is > obvious; all the signs suggest that SD is slated for mainline. It is on > the path, Linus has suggested it and now akpm is asking if it's ready > for 2.6.22. So they figure there is no point testing and replying any > further. SD is ready for prime time, finalised and does everything I > intended it to. This is where I have to reveal to them the horrible > truth. This is no guarantee it will go in. In fact, this one point that > you (Ingo) go on and on about is not only a quibble, but you will call > it an absolute showstopper. As maintainer of the cpu scheduler, in its > current form you will flatly refuse it goes to mainline citing the 5% > of cases where interactivity has regressed. So people will tell me to > fix it, right?... Read on for this to unfold. Sorry, this user got quiet to watch the cat fight. Obviously I should have been throwing messages wrapped around rocks (or something). >> SD has a built-in "interactivity estimator" as well, but hardcoded >> into its design. SD has its own set of ugly-looking tweaks as well - >> for example the prio_matrix. > >I'm sorry but this is a mis-representation to me, as I suggested on an > earlier thread where I disagree about what an interactivity estimator > is. 
The idea of fence posts in a clock that are passed as a way of > metering out earliest-deadline-first in a design is well established. > The matrix is simply an array designed for O(1) lookups of the fence > posts. That is not the same as "oh how much have we slept in the last > $magic_number period and how much extra time should we get for that". > >> So it all comes down on 'what interactivity >> heuristics is enough', and which one is more tweakable. So far i've >> yet to see SD address the hackbench and make -j interactivity >> problems/regression for example, while Mike has been busy addressing >> the 'exploits' reported against mainline. Who gives a s*** about hackbench or a make -j 200?! Those are NOT, and NEVER WILL BE, REAL WORLD LOADS for the vast majority of us. For us SD Just Worked(TM). >And BANG there is the bullet you will use against SD from here to > eternity. SD obeys fairness at all costs. Your interactivity regression > is that SD causes progressive slowdown with load which by definition is > fairness. You repeatedly ask me to address it and there is on unfailing > truth; the only way to address it is to add unfairness to the design. > So why don't I? Because the simple fact is that any unfairness no > matter how carefully administered or metered will always have cases > where it's wrong. Look at the title of this email for example - it's > yet another exploit for the mainline sleep/run mechanism. This does > _not_ mean I'm implying people are logging into servers and running > ./tenp to hang the machine. What it demonstrates is a way of > reproducing the scenario which is biting people with real world loads. > It's entirely believable that a simple p2p app could be behaving like > tenp, only generating a small load and it could take ages to log in and > use the console. Willy has complained this is why people stick to 2.4. > Sure I can create interactivity tweaks worse than anyone else. 
I will > not, though, because that precisely undoes what is special about SD. It > never looks backwards, and is predictable to absurdity. So you'll argue > that mainline can manage it below... > >> > You'll end up with an incresingly complex state machine design of >> > interactivity tweaks and interactivity throttlers all fighting each >> > other to the point where the intearactivity estimator doesn't do >> > anything. [...] >> >> It comes down to defining interactivity by scheduling behavior, and >> making that definition flexible. SD's definition of interactivity is >> rigid (but it's still behavior-based, so not fundamentally different >> from an explicit 'interactivity estimator'), and currently it does not >> work well under high load. But ... i'm still entertaining the notion >> that it might be good enough, but you've got to demonstrate the >> design's flexibility. > >I have yet to see someone find an "exploit" for SD's current design. > Mainline is all about continually patching up the intrinsic design (and > fixing this one test case is not the be all and end all). > >> furthermore, your description does not match my experience when using >> Mike's tweaks and comparing it to SD on the same hardware. According >> to your claim i should have seen regressions popping up in various, >> already-fixed corners, but it didnt happen in practice. But ... i'm >> awaiting further SD and Mike tweaks, the race certainly looks >> interesting ;) > >Well you see a race. I do not. I see a flat predictable performance from > SD where there will always be slowdown with load. I have no intention > of changing that. Mike is making an admirable attempt to fix issues as > they are pointed out. You say there are no regressions but I see > absolutely no testers of his patches besides himself and you. If I > introduce any unfairness based on sleep behaviour into SD I'll be > undoing the whole point of the design and end up chasing new > regressions. 
So I won't quibble over the numbers. SD has produced a lot > of improvements and fairness that mainline struggles with ever > increasing patches to emulate, but SD does so at the expense of > proportional slowdown with load. To be expected, there are after all, only so many cpu cycles to go around. Here I sit, running 2.6.21-rc6 ATM, and since there is not an SD patch that applies cleanly to rc6, I am back to typing half or more of a sentence blind while I answer a posting such as this because of x starvation while kmail is sorting incoming stuff. All this while gkrellm, sitting on the right edge of my screen, is showing a 0 to 2% cpu usage in its graphic display! FWIW, it also isn't suffering the same display update problems, nor is the system clock down on the kickstart bar. If that isn't prima facie evidence of an unfair scheduler, I don't know what is. With the SD patch applied to a working kernel, I've pretty well got my machine back and I'm in command again, just as if I was running nitros9 on my trs-80 Color Computer while it was compiling a program in the background, or back when I was doing all this on an amiga. Both of these had, by their simplistic designs, schedulers that were fair, with (nitr)os9 having the ability to schedule the order that IRQ's were serviced with a priority setting on a per IRQ basis. If Amigados ever had the ability to fiddle with the scheduler other than niceing the process, it wasn't important enough for me to see if I could tweak it because generally it simply worked. Con's earlier patches worked very well for this desktop user, but as Mike kept bitching about "production", (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world? Certainly not this user, who would like to think he has more sense than that) and to heck with the user's experience. 
Con kept trying to satisfy Mike, so the more recent ones (that I was able to apply & test since I was also fighting with the device-mapper change) weren't quite the night and day difference the earlier ones were for a desktop user. I thought I made enough noise in favor of Con's approach early on, but you two got into what can only be described as a cat fight, with Ingo apparently siding with Mike, and Con apparently isn't up to that sort of thing either physically, or mentally after the seemingly endless criticism from Mike. And I hate to say it, but Ingo, you weren't a lot of help either, the setbatch utility you had me doing in my scripts didn't work all that well, returning an error which was its process number when incorporated into a script, and still left me without a machine for 5 seconds at a time when gzip -best was running in the background, all this something I didn't report all that noisily because of the unrelated amanda problems I was having post 2.6.20.3. The SD patches, generally speaking, brought this lag into the less than 1 second range 90% of the time. Maybe I wasn't able to give SD vs mainline aspect my full attention either, you'll recall that I was, and am, still bouncing around from the in-out status of the device-mapper patch. But, now that I know how to handle it (as a startup from square one for amanda) and the reason for the patch, I am more than willing to tolerate it if its a one time thing, and the amanda users have been advised, but now its been reverted for a bit, and will have to go through this particular bit of hassle again later. I disagree with the reversion of that patch on that basis alone. > At least I accept that and will no > longer put my health at risk trying to "fix" it by "breaking" it. SD is > done. > >I feel sorry for the many users out there who are simply "waiting for it > to end up in mainline" who just discovered you will veto it on that > basis. 
lwn.net had it wrong; this was far more painful than any > previous attempt to get anything into mainline. > >My health has been so badly affected by this that I've been given an > ultimatum and must turn my computer off till I get well now which may > be weeks. I already know the massive flameage and last-word comments > that are likely to be fired off before the inevitable decision to veto > it. In this regard, my sympathies Con. Do get well. And re-armed for this battle. It is a worthwhile battle, and many thanks from this user to you for having fought it. And I apologize for not supporting your efforts a lot more vocally. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) Is this going to involve RAW human ecstasy? ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 16:12 ` Gene Heskett @ 2007-04-07 18:08 ` Ingo Molnar 2007-04-07 18:23 ` Gene Heskett 2007-04-07 19:14 ` Mike Galbraith 0 siblings, 2 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-07 18:08 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list * Gene Heskett <gene.heskett@gmail.com> wrote: > To be expected, there are after all, only so many cpu cycles to go > around. Here I sit, running 2.6.21-rc6 ATM, and since there is not an > SD patch that applies cleanly to rc6, I am back to typing half or more > of a sentence blind while I answer a posting such as this because of x > starvation while kmail is sorting incoming stuff. it would be really nice to analyze this. Does the latest -rt patch boot on your box so that we could trace this regression? (I can send you a standalone tracing patch if it doesnt.) IIRC you reported that one of the early patches from Mike made your system behave good (but still not as good as SD) - it would be nice to try a later patch too. basically, the current unfairness in the scheduler should be solved, one way or another. Good testcases were posted and there's progress. > (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world? not many - and i dont think Mike tested any of these - Mike tested pretty low make -j values (Mike, can you confirm?). (I personally routinely run 'make -j 200' build jobs on my box [because it's the central server of a build cluster and high parallelism is needed to overcome network latencies], but i'm pretty special in that regard and i didnt use that workload as a test against any of these schedulers.) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 18:08 ` Ingo Molnar @ 2007-04-07 18:23 ` Gene Heskett 2007-04-07 18:52 ` Ingo Molnar 2007-04-07 19:14 ` Mike Galbraith 1 sibling, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-07 18:23 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On Saturday 07 April 2007, Ingo Molnar wrote: >* Gene Heskett <gene.heskett@gmail.com> wrote: >> To be expected, there are after all, only so many cpu cycles to go >> around. Here I sit, running 2.6.21-rc6 ATM, and since there is not an >> SD patch that applies cleanly to rc6, I am back to typing half or more >> of a sentence blind while I answer a posting such as this because of x >> starvation while kmail is sorting incoming stuff. > >it would be really nice to analyze this. Does the latest -rt patch boot >on your box so that we could trace this regression? (I can send you a >standalone tracing patch if it doesnt.) IIRC you reported that one of >the early patches from Mike made your system behave good (but still not >as good as SD) - it would be nice to try a later patch too. Yes it would be Ingo, but so far, none of the recent -rt patches has booted on this machine, the last one I tried a few days ago failing to find /dev/root, whatever the heck that is. FWIW, I gave up on the rt stuffs 6 months or more ago when the regressions I was reporting weren't ever acknowledged. I don't enjoy sitting through all these e2fsk's during the reboot just to have things I normally run in the background die, like tvtime, sitting there with some news channel muttering along in the background. I was even ignored when I suggested it might be a dma problem, which I still think it could be. Nevertheless, the patch you sent is building as I type, intermittently when the screen deigns to update so I can fix the spelling etc. >basically, the current unfairness in the scheduler should be solved, one >way or another. 
Good testcases were posted and there's progress. > >> (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world? > >not many - and i dont think Mike tested any of these - Mike tested >pretty low make -j values (Mike, can you confirm?). > >(I personally routinely run 'make -j 200' build jobs on my box [because > it's the central server of a build cluster and high parallelism is > needed to overcome network latencies], but i'm pretty special in that > regard and i didnt use that workload as a test against any of these > schedulers.) And I'd wager a cool one that you don't gain more than a second or so in compile time between a make -j8 and a make -j200 unless your network is a pair of tomato juice cans & some string. Again, to me, the network thing is not something that's present in an everyday users environment. My drives are all here and now, on pata-133 interfaces. > Ingo -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) If you would keep a secret from an enemy, tell it not to a friend. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 18:23 ` Gene Heskett @ 2007-04-07 18:52 ` Ingo Molnar 2007-04-07 20:30 ` Gene Heskett 0 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-07 18:52 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list * Gene Heskett <gene.heskett@gmail.com> wrote: > Yes it would be Ingo, but so far, none of the recent -rt patches has > booted on this machine, the last one I tried a few days ago failing to > find /dev/root, whatever the heck that is. did you have a chance to try the yum kernel by any chance? The -testing one you can try on Fedora with little hassle, by doing this as root: cat > /etc/yum.repos.d/rt-testing.repo [rt-testing] name=Ingo's Real-Time (-rt) test-kernel for FC6 baseurl=http://people.redhat.com/mingo/realtime-preempt/yum-testing/yum/ enabled=1 gpgcheck=0 <Ctrl-D> and "yum install kernel-rt" and a reboot should get you going. > [...] I don't enjoy sitting through all these e2fsk's during the > reboot just to have things I normally run in the background die, like > tvtime, sitting there with some news channel muttering along in the > background. I was even ignored when I suggested it might be a dma > problem, which I still think it could be. i did spend quite some time to debug your tv-tuner problem back then, and for that purpose alone i bought a tv tuner card to test this myself. (but it worked on my testbox) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 18:52 ` Ingo Molnar @ 2007-04-07 20:30 ` Gene Heskett 2007-04-08 10:41 ` Ingo Molnar 0 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-07 20:30 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On Saturday 07 April 2007, Ingo Molnar wrote: >* Gene Heskett <gene.heskett@gmail.com> wrote: >> Yes it would be Ingo, but so far, none of the recent -rt patches has >> booted on this machine, the last one I tried a few days ago failing to >> find /dev/root, whatever the heck that is. > >did you have a chance to try the yum kernel by any chance? The -testing >one you can try on Fedora with little hassle, by doing this as root: > >cat > /etc/yum.repos.d/rt-testing.repo >[rt-testing] >name=Ingo's Real-Time (-rt) test-kernel for FC6 >baseurl=http://people.redhat.com/mingo/realtime-preempt/yum-testing/yum/ >enabled=1 >gpgcheck=0 ><Ctrl-D> > >and "yum install kernel-rt" and a reboot should get you going. No, I couldn't seem to get that to show up in a yumex display, and I'm partial to smart anyway. >> [...] I don't enjoy sitting through all these e2fsk's during the >> reboot just to have things I normally run in the background die, like >> tvtime, sitting there with some news channel muttering along in the >> background. I was even ignored when I suggested it might be a dma >> problem, which I still think it could be. > >i did spend quite some time to debug your tv-tuner problem back then, >and for that purpose alone i bought a tv tuner card to test this myself. >(but it worked on my testbox) > > Ingo You didn't tell me this. That said, I am booted to the patch you sent me now, and this also is a very obvious improvement, one I could easily live with on a long term basis. I haven't tried a kernel build in the background yet, but I have sat here and played patience for about an hour, looking for the little stutters, but never saw them. 
So I could just as easily recommend this one for desktop use, it seems to be working. tvtime hasn't had any audio or video glitches that I've noted when I was on that screen to check on an interesting story, like the 102 year old lady who finally got her hole in one, on a very short hole, but after 90 years of golfing, she was beginning to wonder if she would ever get one. Not sure who bought at the 19th hole, HNN didn't cover that traditional part. So this patch also works. And if it gets into mainline, at least Con's efforts at prodding the fixes needed will not have been in vain. My question then, is why did it take a very public cat-fight to get this looked at and the code adjusted? It's been what, nearly 2 years since Linus himself made a comment that this thing needed fixed. The fixes then done were of very little actual effectiveness and the situation then has gradually deteriorated since. It's on the desktop that linux will win or lose the public's market share. After all, there are only so many 'servers' on the planet, a market that linux has pretty well demo'ed its superiority, if not in terms of speed, at least in security. To qualify that, I currently have 2 of yahoo's machines in my .procmailrc's /dev/null list as they are a source of a large number of little 1 to 3 line spams. I assume they are IIS machines, but the email headers aren't that explicit to my relatively untrained eyeballs. And I'd like to see Korea put on a permanent rbl black hole. I'm less than amused at watching the log coming out of my router as first one shithead and then the next makes a 100,000 word dictionary attack against it. One has even found a way to cause a tcp reset about every 10 words tried. But nobody has gotten any farther than that. That knocking sound? Guess. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) You are magnetic in your bearing. 
^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 20:30 ` Gene Heskett @ 2007-04-08 10:41 ` Ingo Molnar 2007-04-08 10:58 ` Ingo Molnar ` (2 more replies) 0 siblings, 3 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-08 10:41 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list * Gene Heskett <gene.heskett@gmail.com> wrote: > That said, I am booted to the patch you sent me now, and this also is > a very obvious improvement, one I could easily live with on a long > term basis. I haven't tried a kernel build in the background yet, but > I have sat here and played patience for about an hour, looking for the > little stutters, but never saw them. So I could just as easily > recommend this one for desktop use, it seems to be working. tvtime > hasn't had any audio or video glitches that I've noted when I was on > that screen to check on an interesting story, like the 102 year old > lady who finally got her hole in one, on a very short hole, but after > 90 years of golfing, she was beginning to wonder if she would ever get > one. Not sure who bought at the 19th hole, HNN didn't cover that > traditional part. > > So this patch also works. And if it gets into mainline, at least > Con's efforts at proding the fixes needed will not have been in vain. thanks for testing it! (for the record, Gene tested sched-mike-4.patch, which is Mike's patch from 4 days ago.) > My question then, is why did it take a very public cat-fight to get > this looked at and the code adjusted? Its been what, nearly 2 years > since Linus himself made a comment that this thing needed fixed. The > fixes then done were of very little actual effectiveness and the > situation then has gradually deteriorated since. this is pretty hard to get right, and the most objective way to change it is to do it testcase-driven. 
FYI, interactivity tweaking has been gradual, the last bigger round of interactivity changes were done a year ago: commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 Author: Mike Galbraith <efault@gmx.de> Date: Mon Apr 10 22:52:44 2006 -0700 [PATCH] sched: fix interactive task starvation (and a few smaller tweaks since then too.) and that change from Mike responded to a testcase. Mike's latest changes (the ones you just tested) were mostly driven by actual testcases too, which measured long-term timeslice distribution fairness. It's really hard to judge interactivity subjectively, so we rely on things like interbench (written by Con) - in which testsuite the upstream scheduler didnt fare all that badly, plus other testcases (thud.c, game_sim.c, now massive_inter.c, fiftyp.c and chew.c) and all the usual test-workloads. This is admittedly a slow process, but it seems to be working too and it also ensures that we dont regress in the future. (because testcases stick around and do get re-tested) your system seems to also be a bit special because you 1) drive it to the absolute max on the desktop but you do not overload it in obvious ways (i.e. your workloads are pretty fairly structured) 2) it's a bit under-powered (single-CPU 800 MHz CPU, right?) but not _too_ underpowered - so i think you /just/ managed to hit 'the worst' of the current interactivity estimator: with important tasks both being just above and just below 50%. Believe me, on all ~10 systems i use regularly, Linux interactivity of the vanilla scheduler is stellar. (And that includes a really old 500 MHz one too with FC6 on it.) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 10:41 ` Ingo Molnar @ 2007-04-08 10:58 ` Ingo Molnar 2007-04-08 17:04 ` Gene Heskett 2007-04-08 11:33 ` Gene Heskett 2007-04-08 18:51 ` Rene Herman 2 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-08 10:58 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list * Ingo Molnar <mingo@elte.hu> wrote: > > My question then, is why did it take a very public cat-fight to get > > this looked at and the code adjusted? Its been what, nearly 2 years > > since Linus himself made a comment that this thing needed fixed. > > The fixes then done were of very little actual effectiveness and the > > situation then has gradually deteriorated since. > > this is pretty hard to get right, and the most objective way to change > it is to do it testcase-driven. FYI, interactivity tweaking has been > gradual, the last bigger round of interactivity changes were done a > year ago: and note that a year ago Mike did a larger patch too, not unlike his current patch - but we hoped that his smaller change would be sufficient - and nobody came along and said "i tested Mike's and the difference is significant on my system". Which seems to suggest that the number of problem-systems and worried users/developers isnt particularly large. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 10:58 ` Ingo Molnar @ 2007-04-08 17:04 ` Gene Heskett 2007-04-09 4:03 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-08 17:04 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On Sunday 08 April 2007, Ingo Molnar wrote: >* Ingo Molnar <mingo@elte.hu> wrote: >> > My question then, is why did it take a very public cat-fight to get >> > this looked at and the code adjusted? Its been what, nearly 2 years >> > since Linus himself made a comment that this thing needed fixed. >> > The fixes then done were of very little actual effectiveness and the >> > situation then has gradually deteriorated since. >> >> this is pretty hard to get right, and the most objective way to change >> it is to do it testcase-driven. FYI, interactivity tweaking has been >> gradual, the last bigger round of interactivity changes were done a >> year ago: > >and note that a year ago Mike did a larger patch too, not unlike his >current patch - but we hoped that his smaller change would be sufficient >- and nobody came along and said "i tested Mike's and the difference is >significant on my system". May I suggest that while it may have been noticeable, it was not 'significant', so we didn't sing praises and bow to mecca at the time. I just thought that this is the way it was, till Cons patch proved otherwise for this 'desktop' user. We were then, and still are, looking for the magic that lets it all load up and slow down in a linear feeling fashion. Only those IRQ's that are fleeting and need serviced NOW should be exceptions to that rule. AFAIAC, gzip can take its turn in the queue, getting no more time in proportion than any other process that wakes up in its slice and finds it has something to do, if nothing to do it should yield the floor immediately, and in any event be put back at the far end of the queue when its timeslice is over. 
gzip in particular seems very reticent to give up the cpu at what should be the end of its timeslice. As it is, the IRQ's are being serviced, so no keystrokes are being lost, or very few, unlike the situation 2 years ago when whole sentences typed blind were on the missing list when x finally did get a chance to play catchup. As a desktop user, I fail to understand any good reason why a keystroke typed can't be echoed to the screen within 200 milliseconds regardless of how many gzip -best's amdump may be running in the background. I have a coco3, running nitros9 at a cpu clock rate of 1.79mhz with a 1/10th second context switch, in the basement that CAN do that while assembling an executable with a separate process printing the listing of that assembly as it progresses. Why can't linux? >Which seems to suggest that the number of >problem-systems and worried users/developers isnt particularly large. Again, may I suggest that this sort of behavior on the desktop is a contributing factor to that relative scarcity? > Ingo -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) The meek will inherit the earth -- if that's OK with you. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 17:04 ` Gene Heskett @ 2007-04-09 4:03 ` Mike Galbraith 2007-04-09 4:08 ` Gene Heskett 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 4:03 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 13:04 -0400, Gene Heskett wrote: > On Sunday 08 April 2007, Ingo Molnar wrote: > >and note that a year ago Mike did a larger patch too, not unlike his > >current patch - but we hoped that his smaller change would be sufficient > >- and nobody came along and said "i tested Mike's and the difference is > >significant on my system". > > May I suggest that while it may have been noticeable, it was > not 'significant', so we didn't sing praises and bow to mecca at the > time. Actually, there was practically nil interest in testing. We made a couple of minor adjustments to the interactivity logic, and all went quiet, so I didn't think it was enough of a problem to require more intrusive countermeasures. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 4:03 ` Mike Galbraith @ 2007-04-09 4:08 ` Gene Heskett 2007-04-09 5:59 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-09 4:08 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Monday 09 April 2007, Mike Galbraith wrote: >On Sun, 2007-04-08 at 13:04 -0400, Gene Heskett wrote: >> On Sunday 08 April 2007, Ingo Molnar wrote: >> >and note that a year ago Mike did a larger patch too, not unlike his >> >current patch - but we hoped that his smaller change would be >> > sufficient - and nobody came along and said "i tested Mike's and the >> > difference is significant on my system". >> >> May I suggest that while it may have been noticeable, it was >> not 'significant', so we didn't sing praises and bow to mecca at the >> time. > >Actually, there was practically nil interest in testing. We made a >couple of minor adjustments to the interactivity logic, and all went >quiet, so I didn't think it was enough of a problem to require more >intrusive countermeasures. > > -Mike Does one of these messages have a url so I can test the latest of your patches for -rc6? Or was the one Ingo sent the most recent? Putting that url in your sig would be nice, and might result in its getting a lot more exercise which should = more feedback. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) Got a complaint about the Internal Revenue Service? Call the convenient toll-free "IRS Taxpayer Complaint Hot Line Number": 1-800-AUDITME ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 4:08 ` Gene Heskett @ 2007-04-09 5:59 ` Mike Galbraith 2007-04-09 13:01 ` Gene Heskett 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 5:59 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Mon, 2007-04-09 at 00:08 -0400, Gene Heskett wrote: > On Monday 09 April 2007, Mike Galbraith wrote: > > > > > >Actually, there was practically nil interest in testing. We made a > >couple of minor adjustments to the interactivity logic, and all went > >quiet, so I didn't think it was enough of a problem to require more > >intrusive countermeasures. > > > > -Mike > > Does one of these messages have a url so I can test the latest of your > patches for -rc6? Or was the one Ingo sent the most recent? No, my tree has a bugfix and some other adjustments that try to move the balance closer to fair without sacrificing interactivity. > Putting that url in your sig would be nice, and might result in its > getting a lot more exersize which should = more feedback. When I get it cleaned up and better tested, I'll post again. If you want, I'll CC you... willing victims are a highly valued commodity :) -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:59 ` Mike Galbraith @ 2007-04-09 13:01 ` Gene Heskett 0 siblings, 0 replies; 92+ messages in thread From: Gene Heskett @ 2007-04-09 13:01 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Monday 09 April 2007, Mike Galbraith wrote: >On Mon, 2007-04-09 at 00:08 -0400, Gene Heskett wrote: >> On Monday 09 April 2007, Mike Galbraith wrote: >> >Actually, there was practically nil interest in testing. We made a >> >couple of minor adjustments to the interactivity logic, and all went >> >quiet, so I didn't think it was enough of a problem to require more >> >intrusive countermeasures. >> > >> > -Mike >> >> Does one of these messages have a url so I can test the latest of your >> patches for -rc6? Or was the one Ingo sent the most recent? > >No, my tree has a bugfix and some other adjustments that try to move the >balance closer to fair without sacrificing interactivity. > >> Putting that url in your sig would be nice, and might result in its >> getting a lot more exersize which should = more feedback. > >When I get it cleaned up and better tested, I'll post again. If you >want, I'll CC you... willing victims are a highly valued commodity :) > > -Mike :) Put me on that list, Mike. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) The box said "Requires Windows 95 or better." I can't understand why it won't work on my Linux computer. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 10:41 ` Ingo Molnar 2007-04-08 10:58 ` Ingo Molnar @ 2007-04-08 11:33 ` Gene Heskett 2007-04-08 11:40 ` Mike Galbraith 2007-04-08 18:51 ` Rene Herman 2 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-08 11:33 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On Sunday 08 April 2007, Ingo Molnar wrote: >* Gene Heskett <gene.heskett@gmail.com> wrote: >> That said, I am booted to the patch you sent me now, and this also is >> a very obvious improvement, one I could easily live with on a long >> term basis. I haven't tried a kernel build in the background yet, but >> I have sat here and played patience for about an hour, looking for the >> little stutters, but never saw them. So I could just as easily >> recommend this one for desktop use, it seems to be working. tvtime >> hasn't had any audio or video glitches that I've noted when I was on >> that screen to check on an interesting story, like the 102 year old >> lady who finally got her hole in one, on a very short hole, but after >> 90 years of golfing, she was beginning to wonder if she would ever get >> one. Not sure who bought at the 19th hole, HNN didn't cover that >> traditional part. >> >> So this patch also works. And if it gets into mainline, at least >> Con's efforts at proding the fixes needed will not have been in vain. > >thanks for testing it! (for the record, Gene tested sched-mike-4.patch, >which is Mike's patch from 4 days ago.) > >> My question then, is why did it take a very public cat-fight to get >> this looked at and the code adjusted? Its been what, nearly 2 years >> since Linus himself made a comment that this thing needed fixed. The >> fixes then done were of very little actual effectiveness and the >> situation then has gradually deteriorated since. > >this is pretty hard to get right, and the most objective way to change >it is to do it testcase-driven. 
FYI, interactivity tweaking has been >gradual, the last bigger round of interactivity changes were done a year >ago: > > commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 > Author: Mike Galbraith <efault@gmx.de> > Date: Mon Apr 10 22:52:44 2006 -0700 > > [PATCH] sched: fix interactive task starvation > >(and a few smaller tweaks since then too.) > >and that change from Mike responded to a testcase. Mike's latest changes >(the ones you just tested) were mostly driven by actual testcases too, >which measured long-term timeslice distribution fairness. > >It's really hard to judge interactivity subjectively, so we rely on >things like interbench (written by Con) - in which testsuite the >upstream scheduler didnt fare all that badly, plus other testcases >(thud.c, game_sim.c, now massive_inter.c, fiftyp.c and chew.c) and all >the usual test-workloads. This is admittedly a slow process, but it >seems to be working too and it also ensures that we dont regress in the >future. (because testcases stick around and do get re-tested) > >your system seems to also be a bit special because you 1) drive it to >the absolute max on the desktop but you do not overload it in obvious >ways (i.e. your workloads are pretty fairly structured) 2) it's a bit >under-powered (single-CPU 800 MHz CPU, right?) but not _too_ >underpowered - so i think you /just/ managed to hit 'the worst' of the >current interactivity estimator: with important tasks both being just >above and just below 50%. Believe me, on all ~10 systems i use >regularly, Linux interactivity of the vanilla scheduler is stellar. (And >that includes a really old 500 MHz one too with FC6 on it.) Actually, its an XP2800 Athlon, 333 fsb, gig of memory. And I was all enthusiastic about this until amanda's nightly run started, at which point I started losing control for quite long periods, 30+ seconds at a time. Up till then I thought we had it made. 
In this regard, Con's patches were enough better to notice it right away, lags were 1-2 seconds max. That seems to be the killer loading here, building a kernel (make -j3) doesn't seem to lag it all that bad. One session of gzip -best makes it fall plumb over though, which was a disappointment. But, I could live with this. Now if I could figure out a way to nail dm_mod down to a fixed LANANA approved address, I just got bit again, because enabling pktcdvd caused a MAJOR switch, only from 253 to 252 but tar thinks the whole 45GB is all new again. So since it, dm_mod, no longer carries the experimental label, let's put that patch back in and be done with this particular hassle once and for all. If I had known that using LVM2 was going to be such a pain in the ass just with this item alone, I wouldn't have touched it with a 50 foot fiberglass pole. Or does this SOB affect normal partition mountings too? I don't know, and the suggested fixes from David Dillow I put in /etc/modprobe.conf are ignored for dm_mod, and when extended to pktcdvd, cause pktcdvd to fail totally. Mmm??, can I pass an 'option dm_mod major=238' as a kernel argument & make it work that way? This is extremely frustrating as it is now. > Ingo -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) Real Programmers don't write in PL/I. PL/I is for programmers who can't decide whether to write in COBOL or FORTRAN. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 11:33 ` Gene Heskett @ 2007-04-08 11:40 ` Mike Galbraith 2007-04-08 12:02 ` Mike Galbraith 2007-04-08 17:56 ` Gene Heskett 0 siblings, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-08 11:40 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: > That seems to be the killer loading here, building a kernel (make -j3) > doesn't seem to lag it all that bad. One session of gzip -best makes it > fall plumb over though, which was a disappointment. Can you make a testcase that doesn't require amanda? -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 11:40 ` Mike Galbraith @ 2007-04-08 12:02 ` Mike Galbraith 2007-04-08 17:57 ` Gene Heskett 2007-04-08 17:56 ` Gene Heskett 1 sibling, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-08 12:02 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote: > On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: > > > That seems to be the killer loading here, building a kernel (make -j3) > > doesn't seem to lag it all that bad. One session of gzip -best makes it > > fall plumb over though, which was a disappointment. > > Can you make a testcase that doesn't require amanda? Or at least send me a couple of 5 or 10 second top snapshots (which also show CPU usage of sleeping tasks) while the system is misbehaving? -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 12:02 ` Mike Galbraith @ 2007-04-08 17:57 ` Gene Heskett 2007-04-09 4:19 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-08 17:57 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sunday 08 April 2007, Mike Galbraith wrote: >On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote: >> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: >> > That seems to be the killer loading here, building a kernel (make >> > -j3) doesn't seem to lag it all that bad. One session of gzip -best >> > makes it fall plumb over though, which was a disappointment. >> >> Can you make a testcase that doesn't require amanda? > >Or at least send me a couple of 5 or 10 second top snapshots (which also >show CPU usage of sleeping tasks) while the system is misbehaving? > > -Mike With what monitor utility? -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) "Microsoft technology" -- isn't that an oxymoron? -- Gareth Barnard ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 17:57 ` Gene Heskett @ 2007-04-09 4:19 ` Mike Galbraith 2007-04-09 5:23 ` Gene Heskett 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 4:19 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 13:57 -0400, Gene Heskett wrote: > On Sunday 08 April 2007, Mike Galbraith wrote: > >On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote: > >> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: > >> > That seems to be the killer loading here, building a kernel (make > >> > -j3) doesn't seem to lag it all that bad. One session of gzip -best > >> > makes it fall plumb over though, which was a disappointment. > >> > >> Can you make a testcase that doesn't require amanda? > > > >Or at least send me a couple of 5 or 10 second top snapshots (which also > >show CPU usage of sleeping tasks) while the system is misbehaving? > > > > -Mike > > With what monitor utility? Top. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 4:19 ` Mike Galbraith @ 2007-04-09 5:23 ` Gene Heskett 2007-04-09 6:09 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-09 5:23 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Monday 09 April 2007, Mike Galbraith wrote: >On Sun, 2007-04-08 at 13:57 -0400, Gene Heskett wrote: >> On Sunday 08 April 2007, Mike Galbraith wrote: >> >On Sun, 2007-04-08 at 13:40 +0200, Mike Galbraith wrote: >> >> On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: >> >> > That seems to be the killer loading here, building a kernel (make >> >> > -j3) doesn't seem to lag it all that bad. One session of gzip >> >> > -best makes it fall plumb over though, which was a >> >> > disappointment. >> >> >> >> Can you make a testcase that doesn't require amanda? >> > >> >Or at least send me a couple of 5 or 10 second top snapshots (which >> > also show CPU usage of sleeping tasks) while the system is >> > misbehaving? >> > >> > -Mike >> >> With what monitor utility? > >Top. > > -Mike This may not be so informative, its almost behaving ATM. 29252 amanda 22 0 1856 572 220 R 76.4 0.1 1:07.24 gzip 29235 amanda 15 0 2992 1224 888 S 5.6 0.1 0:02.80 chunker 29500 root 18 0 2996 1164 788 S 4.0 0.1 0:02.40 tar 10459 amanda 15 0 3340 1052 832 S 3.0 0.1 0:49.04 amandad 10536 amanda 15 0 3276 1308 1004 S 2.3 0.1 0:40.92 dumper 29496 amanda 18 0 2808 472 280 S 2.0 0.0 0:01.73 sendbackup 4057 gkrellmd 15 0 11568 1172 896 S 1.3 0.1 7:45.82 gkrellmd 29498 amanda 18 0 2396 780 656 S 1.0 0.1 0:00.60 tar 19183 root 15 0 0 0 0 S 0.7 0.0 0:01.92 pdflush I also note with some disdain that I'm half a megabyte into swap, but I've had FF-2.0.0.3 busy for the last hour while amanda was trying to find a few cycles at the same time. Looking at a bunch of pdf's of circuit boards to see if I wanna build them for my milling machine. 
-- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) Fatal Error: Found MS-Windows System -> Repartitioning Disk for Linux... ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:23 ` Gene Heskett @ 2007-04-09 6:09 ` Mike Galbraith 0 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 6:09 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Mon, 2007-04-09 at 01:23 -0400, Gene Heskett wrote: > This may not be so informative, its almost behaving ATM. > > 29252 amanda 22 0 1856 572 220 R 76.4 0.1 1:07.24 gzip > 29235 amanda 15 0 2992 1224 888 S 5.6 0.1 0:02.80 chunker > 29500 root 18 0 2996 1164 788 S 4.0 0.1 0:02.40 tar > 10459 amanda 15 0 3340 1052 832 S 3.0 0.1 0:49.04 amandad > 10536 amanda 15 0 3276 1308 1004 S 2.3 0.1 0:40.92 dumper > 29496 amanda 18 0 2808 472 280 S 2.0 0.0 0:01.73 sendbackup > 4057 gkrellmd 15 0 11568 1172 896 S 1.3 0.1 7:45.82 gkrellmd > 29498 amanda 18 0 2396 780 656 S 1.0 0.1 0:00.60 tar > 19183 root 15 0 0 0 0 S 0.7 0.0 0:01.92 pdflush > Yeah, this is showing the scheduler behaving properly. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 11:40 ` Mike Galbraith 2007-04-08 12:02 ` Mike Galbraith @ 2007-04-08 17:56 ` Gene Heskett 2007-04-09 4:17 ` Mike Galbraith 1 sibling, 1 reply; 92+ messages in thread From: Gene Heskett @ 2007-04-08 17:56 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sunday 08 April 2007, Mike Galbraith wrote: >On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: >> That seems to be the killer loading here, building a kernel (make -j3) >> doesn't seem to lag it all that bad. One session of gzip -best makes >> it fall plumb over though, which was a disappointment. > >Can you make a testcase that doesn't require amanda? > > -Mike Sure. Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up' Or, from the runtar log from this morning, and this is all one line: runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create' '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system' '--listed-incremental' '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new' '--sparse' '--ignore-failed-read' '--totals' '--exclude-from' '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.' and amanda will if requested, pipe that output through a |gzip -best, and its this process that brings the machine to the table begging for scraps like a puppy. Tar by itself can be felt but isn't bad. Even without the -best switch in effect, I'm sure you'll see the machine slow considerably. Please don't try to call amanda an unusual load as amanda itself is nothing but an intelligent manager, constructing the command lines passed to tar or dump, and gzip, which do the real work. Amdump, the manager my scripts wrap around, and my scripts themselves, will not use more than .01% of the cpu when averaged over the whole backup session. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." 
-Ed Howdershelt (Author) We are Microsoft. What you are experiencing is not a problem; it is an undocumented feature. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 17:56 ` Gene Heskett @ 2007-04-09 4:17 ` Mike Galbraith 2007-04-09 5:16 ` Gene Heskett 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 4:17 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 13:56 -0400, Gene Heskett wrote: > On Sunday 08 April 2007, Mike Galbraith wrote: > >On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: > >> That seems to be the killer loading here, building a kernel (make -j3) > >> doesn't seem to lag it all that bad. One session of gzip -best makes > >> it fall plumb over though, which was a disappointment. > > > >Can you make a testcase that doesn't require amanda? > > > > -Mike > > Sure. Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up' > > Or, from the runtar log from this morning, and this is all one line: > > runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create' '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system' '--listed-incremental' '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new' '--sparse' '--ignore-failed-read' '--totals' '--exclude-from' '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.' > > and amanda will if requested, pipe that output through a |gzip -best, and > its this process that brings the machine to the table begging for scraps > like a puppy. Tar by itself can be felt but isn't bad. So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the problem? -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 4:17 ` Mike Galbraith @ 2007-04-09 5:16 ` Gene Heskett 2007-04-09 6:06 ` Mike Galbraith 2007-04-09 8:24 ` Mike Galbraith 0 siblings, 2 replies; 92+ messages in thread From: Gene Heskett @ 2007-04-09 5:16 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Monday 09 April 2007, Mike Galbraith wrote: >On Sun, 2007-04-08 at 13:56 -0400, Gene Heskett wrote: >> On Sunday 08 April 2007, Mike Galbraith wrote: >> >On Sun, 2007-04-08 at 07:33 -0400, Gene Heskett wrote: >> >> That seems to be the killer loading here, building a kernel (make >> >> -j3) doesn't seem to lag it all that bad. One session of gzip >> >> -best makes it fall plumb over though, which was a disappointment. >> > >> >Can you make a testcase that doesn't require amanda? >> > >> > -Mike >> >> Sure. Try 'tar czf nameofarchive.tar.gz /path/to-dir-to-be-backed-up' >> >> Or, from the runtar log from this morning, and this is all one line: >> >> runtar.20070408022016.debug:running: /bin/tar: 'gtar' '--create' >> '--file' '-' '--directory' '/usr/dlds-rpms' '--one-file-system' >> '--listed-incremental' >> '/usr/local/var/amanda/gnutar-lists/coyote_usr_dlds-rpms_1.new' >> '--sparse' '--ignore-failed-read' '--totals' '--exclude-from' >> '/tmp/amanda/sendbackup._usr_dlds-rpms.20070408022016.exclude' '.' >> >> and amanda will if requested, pipe that output through a |gzip -best, >> and its this process that brings the machine to the table begging for >> scraps like a puppy. Tar by itself can be felt but isn't bad. > >So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the >problem? > > -Mike That looks as if it should demo it pretty well if I understand correctly everything you're doing there. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." 
-Ed Howdershelt (Author) In /users3 did Kubla Kahn A stately pleasure dome decree, Where /bin, the sacred river ran Through Test Suites measureless to Man Down to a sunless C. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:16 ` Gene Heskett @ 2007-04-09 6:06 ` Mike Galbraith 2007-04-09 8:24 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 6:06 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Mon, 2007-04-09 at 01:16 -0400, Gene Heskett wrote: > On Monday 09 April 2007, Mike Galbraith wrote: > >So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the > >problem? > > > > -Mike > > That looks as if it should demo it pretty well if I understand correctly > everything you're doing there. Well, I let it process my ~250GB of data with my current tree, and it looked utterly harmless (and since I'm running SMP, was of course). I'll try building UP to make sure, and check mainline as well. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:16 ` Gene Heskett 2007-04-09 6:06 ` Mike Galbraith @ 2007-04-09 8:24 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 8:24 UTC (permalink / raw) To: Gene Heskett Cc: linux-kernel, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Mon, 2007-04-09 at 01:16 -0400, Gene Heskett wrote: > On Monday 09 April 2007, Mike Galbraith wrote: > > > >So tar -cvf - / | gzip --best | tar -tvzf - should reproduce the > >problem? > > > That looks as if it should demo it pretty well if I understand correctly > everything you're doing there. Ok, I can't reproduce any bad interactivity here with that workload either with SMP or UP kernel. That said however, gzip does attain interactive status, which it really should not - that gives it an unfair advantage over its peers. With my throttled tree, it gets pushed back down to where it belongs. I'm going to try to tighten the tolerance on behavior to evict the riffraff who don't really belong in the elite interactive club sooner, and guarantee that even fast/light tasks can't dominate the CPU without paying heavily. (to close the many fast/light tasks wakeup scenario that the "untested" patch someone mentioned did, but was shown to be too painful to bear). -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 10:41 ` Ingo Molnar 2007-04-08 10:58 ` Ingo Molnar 2007-04-08 11:33 ` Gene Heskett @ 2007-04-08 18:51 ` Rene Herman 2007-04-09 4:23 ` Mike Galbraith 2007-04-09 13:53 ` Ingo Molnar 2 siblings, 2 replies; 92+ messages in thread From: Rene Herman @ 2007-04-08 18:51 UTC (permalink / raw) To: Ingo Molnar Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On 04/08/2007 12:41 PM, Ingo Molnar wrote: > this is pretty hard to get right, and the most objective way to change > it is to do it testcase-driven. FYI, interactivity tweaking has been > gradual, the last bigger round of interactivity changes were done a year > ago: > > commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 > Author: Mike Galbraith <efault@gmx.de> > Date: Mon Apr 10 22:52:44 2006 -0700 > > [PATCH] sched: fix interactive task starvation > > (and a few smaller tweaks since then too.) > > and that change from Mike responded to a testcase. Mike's latest changes > (the ones you just tested) were mostly driven by actual testcases too, > which measured long-term timeslice distribution fairness. Ah yes, that one. Here's the next one in that series: commit f1adad78dd2fc8edaa513e0bde92b4c64340245c Author: Linus Torvalds <torvalds@g5.osdl.org> Date: Sun May 21 18:54:09 2006 -0700 Revert "[PATCH] sched: fix interactive task starvation" It personally had me wonder if _anyone_ was testing this stuff... Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 18:51 ` Rene Herman @ 2007-04-09 4:23 ` Mike Galbraith 2007-04-09 12:14 ` Rene Herman 2007-04-09 13:53 ` Ingo Molnar 1 sibling, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 4:23 UTC (permalink / raw) To: Rene Herman Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On Sun, 2007-04-08 at 20:51 +0200, Rene Herman wrote: > On 04/08/2007 12:41 PM, Ingo Molnar wrote: > > > this is pretty hard to get right, and the most objective way to change > > it is to do it testcase-driven. FYI, interactivity tweaking has been > > gradual, the last bigger round of interactivity changes were done a year > > ago: > > > > commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 > > Author: Mike Galbraith <efault@gmx.de> > > Date: Mon Apr 10 22:52:44 2006 -0700 > > > > [PATCH] sched: fix interactive task starvation > > > > (and a few smaller tweaks since then too.) > > > > and that change from Mike responded to a testcase. Mike's latest changes > > (the ones you just tested) were mostly driven by actual testcases too, > > which measured long-term timeslice distribution fairness. > > Ah yes, that one. Here's the next one in that series: > > commit f1adad78dd2fc8edaa513e0bde92b4c64340245c > Author: Linus Torvalds <torvalds@g5.osdl.org> > Date: Sun May 21 18:54:09 2006 -0700 > > Revert "[PATCH] sched: fix interactive task starvation" > > It personally had me wonder if _anyone_ was testing this stuff... Well of course not. Making random untested changes, and reverting them later is half the fun of kernel development. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 4:23 ` Mike Galbraith @ 2007-04-09 12:14 ` Rene Herman 2007-04-09 13:27 ` Andreas Mohr ` (2 more replies) 0 siblings, 3 replies; 92+ messages in thread From: Rene Herman @ 2007-04-09 12:14 UTC (permalink / raw) To: Mike Galbraith Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On 04/09/2007 06:23 AM, Mike Galbraith wrote: > On Sun, 2007-04-08 at 20:51 +0200, Rene Herman wrote: >> On 04/08/2007 12:41 PM, Ingo Molnar wrote: >>> commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 >>> Author: Mike Galbraith <efault@gmx.de> >>> Date: Mon Apr 10 22:52:44 2006 -0700 >>> >>> [PATCH] sched: fix interactive task starvation >>> >>> (and a few smaller tweaks since then too.) >>> >>> and that change from Mike responded to a testcase. Mike's latest changes >>> (the ones you just tested) were mostly driven by actual testcases too, >>> which measured long-term timeslice distribution fairness. >> >> Ah yes, that one. Here's the next one in that series: >> >> commit f1adad78dd2fc8edaa513e0bde92b4c64340245c >> Author: Linus Torvalds <torvalds@g5.osdl.org> >> Date: Sun May 21 18:54:09 2006 -0700 >> >> Revert "[PATCH] sched: fix interactive task starvation" >> >> It personally had me wonder if _anyone_ was testing this stuff... > > Well of course not. Making random untested changes, and reverting > them later is half the fun of kernel development. The point ofcourse is that the very example Molnar quoted as an example of responsible, testcase driven development was in fact hugely broken and sat in the tree that way for 4 rc's. To me, the example rather serves as confirmation of what Kolivas has been saying; endlessly tweaking the tweaks isn't going anywhere. The minute you tweak A, tweak B over there in corner C-Sharp falls flat on its face. Computers are horribly stupid and tend to fail most situations their smart human programmers didn't specifically tell them about. 
If, as in the case of a scheduler, the real-world demands on a piece of software are so diverse that you cannot tell them about all possible situations specifically, the only workable solution is to make them _predictable_ so that when hitting one of those special situations, the smart human using the computer at least gets to know how to intervene if he feels inclined to do so. This turned into an interactivity thing, and while interactivity is in fact better for a large majority of testers, that isn't what Kolivas' scheduler is about. It's about predictability and leaving the dead-end road of these endlesss tweaks, which then break previous tweaks, rinse, repeat. It's unfortunate that Kolivas is having health problems currently, but I certainly do hope that his scheduler finds its way into _a_ -rc1. He said it was done... Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 12:14 ` Rene Herman @ 2007-04-09 13:27 ` Andreas Mohr 2007-04-09 19:54 ` Rene Herman 2007-04-09 14:15 ` Ingo Molnar 2007-04-09 17:10 ` Mike Galbraith 2 siblings, 1 reply; 92+ messages in thread From: Andreas Mohr @ 2007-04-09 13:27 UTC (permalink / raw) To: Rene Herman Cc: Mike Galbraith, Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list Hi, On Mon, Apr 09, 2007 at 02:14:49PM +0200, Rene Herman wrote: > This turned into an interactivity thing, and while interactivity is in > fact better for a large majority of testers, that isn't what Kolivas' > scheduler is about. It's about predictability and leaving the dead-end > road of these endlesss tweaks, which then break previous tweaks, rinse, > repeat. > > It's unfortunate that Kolivas is having health problems currently, but I > certainly do hope that his scheduler finds its way into _a_ -rc1. He > said it was done... The whole recent discussion/flamefest/... here makes me think that we're still heading towards actually introducing plugsched (most preferrably by making mainline scheduler the builtin default and optionally building a plugsched kernel which then allows selection). There are fundamental behavioural differences between the various CPU scheduler types developed; while some people want a very interactive system with in most(!) cases good latency and exploit-less operation, several others want a scheduler which provides very predictable latency, low overhead and additionally as much interactivity as this strict model can provide for. And then there are people who have very specific SMP requirements which both characteristic scheduler types may have trouble satisfying properly. And I really don't see much difference whatsoever to the I/O scheduler area: some people want predictable latency, while others want maximum throughput or fastest operation for seek-less flash devices (noop). 
Hardware varies similarly greatly as well: Some people have huge disk arrays or NAS, others have a single flash disk. Some people have a decaying UP machine, others have huge SMP farms. IMHO both areas are too varied, thus runtime or compile-time selection is justified for both areas, not simply for I/O schedulers only. I don't think anybody would want to introduce new very similar scheduler types just for the fun of it; development would center around improving the at most 3 or 4 different scheduler implementations (as is the case with I/O schedulers, BTW: there hasn't been an explosion of different variants either!). I think the whole discussion went on the wrong track when people somehow had the notion of making RSDL (and its later variants) the main scheduler for desktop machines, not just server operation. And this target of course (and rightfully so) prompted people to ask for interactivity similar to what the current scheduler achieves which RSDL cannot fully provide within its strict design, however. However having mainline remain the only scheduler doesn't seem to be such an attractive option either, e.g. due to its non-predictability, when there exist several alternatives with rather nice behaviour. Thus I'd still tend towards making things runtime-selectable, scheduler goals are just too varied to ever sufficiently achieve Best Results In Every Area (tm). Not to mention that making schedulers runtime-selectable would enable uncovering various application timing bugs much faster (e.g. the RPM timing issues that -ck managed to hit). Oh, and get well very soon, Con, Linux needs you, a lot :) Andreas Mohr ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 13:27 ` Andreas Mohr @ 2007-04-09 19:54 ` Rene Herman 0 siblings, 0 replies; 92+ messages in thread From: Rene Herman @ 2007-04-09 19:54 UTC (permalink / raw) To: Andreas Mohr Cc: Mike Galbraith, Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On 04/09/2007 03:27 PM, Andreas Mohr wrote: > And I really don't see much difference whatsoever to the I/O scheduler > area: some people want predictable latency, while others want maximum > throughput or fastest operation for seek-less flash devices (noop). > Hardware varies similarly greatly has well: > Some people have huge disk arrays or NAS, others have a single flash disk. > Some people have a decaying UP machine, others have huge SMP farms. I do agree, and yes, I/O scheduling seems to not have suffered from the choice although I must say I'm not sure how much use each I/O scheduler individually sees. If one CPU scheduler can be good enough then it would be better to just have that one, but well, yes, maybe it can't. I certainly believe any one scheduler can't avoid breaking down under some condition. Demand is just too varied. I find it interesting that you see SD as a server scheduler and I guess deterministic behaviour does point in that direction somewhat. I would be enabling it on the desktop though, which probably is _some_ argument on having multiple schedulers. Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 12:14 ` Rene Herman 2007-04-09 13:27 ` Andreas Mohr @ 2007-04-09 14:15 ` Ingo Molnar 2007-04-09 17:05 ` Rene Herman 2007-04-09 17:10 ` Mike Galbraith 2 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-09 14:15 UTC (permalink / raw) To: Rene Herman Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton * Rene Herman <rene.herman@gmail.com> wrote: > To me, the example rather serves as confirmation of what Kolivas has > been saying; endlessly tweaking the tweaks isn't going anywhere. but ... SD clearly regresses in some areas, so by that logic SD isnt going anywhere either? note that i still like the basic idea about SD, that it is an experiment that if the only conceptual focus is on "scheduling fairness", we'll get a better scheduler. But for that to work out two things have to be done i think: - the code actually has to match that stated goal. Right now it diverges from it (it is not a "fair" scheduler), and it's not clear why. note that SD at the moment produces ~10% more code in sched.o, and the reason is that SD is more complex than the vanilla scheduler. People tend to get the impression that SD is simpler, partly because it is a net linecount win in sched.c, but many of the removed lines are comments. this "provide fairness" goal is quite important, because if SD's code is not only about providing fairness, what is the rest of the logic doing? Are they "tweaks", to achieve interactivity? If yes, why are they not marked as such? I.e. will we go down the _same_ road again, but this time with a much less clearly defined rule for what a "tweak" is? note that under the interactivity estimator it is not that hard to achieve forced "fairness". So _if_ we accept that scheduling must include a fair dose of heuristics (which i tend to think it has to), we are perhaps better off with an interactivity design that _accepts_ this fundamental fact and separates heuristics from core scheduling. 
Right now i dont see the SD proponents even _accepting_ that even the current SD code does include heuristics. the other one is: - the code has to demonstrate that it can flexibly react to various complaints of regressions. (I identified a few problem workloads that we tend to care about and i havent seen much progress with them - but i really reserve judgement about that, given Con's medical condition.) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 14:15 ` Ingo Molnar @ 2007-04-09 17:05 ` Rene Herman 2007-04-09 17:48 ` Ingo Molnar 0 siblings, 1 reply; 92+ messages in thread From: Rene Herman @ 2007-04-09 17:05 UTC (permalink / raw) To: Ingo Molnar Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton On 04/09/2007 04:15 PM, Ingo Molnar wrote: > * Rene Herman <rene.herman@gmail.com> wrote: > >> To me, the example rather serves as confirmation of what Kolivas >> has been saying; endlessly tweaking the tweaks isn't going >> anywhere. > > but ... SD clearly regresses in some areas, so by that logic SD isnt > going anywhere either? No. The logic isn't that (performance and other) characteristics must always be exactly the same between two schedulers, the logic is that having one of them turn into a contrived heap of heuristics where every progression on one front turns into a regression on another means that one is on a dead-end road. Now ofcourse, while not needing to behave the same in all conceivable situations, any alternative like SD needs to behave _well_ and for me, it is currently, while just using it. > note that i still like the basic idea about SD, that it is an > experiment that if the only conceptual focus is on "scheduling > fairness", we'll get a better scheduler. But for that to work out two > things have to be done i think: > > - the code actually has to match that stated goal. Right now it > diverges from it (it is not a "fair" scheduler), and it's not clear > why. I read most of the discussion centering around that specific point as well, and frankly, I mostly came away from it thinking "so what?". It seems this is largely an issue of you and Kolivas disagreeing on what needs to be called design and what needs to be called implementation, but more importantly I feel a solution is to just shy away from the inherently subjective word "fair". 
If you feel that some of the things SD does need to be called "unfair" as much as mainline, so be it, but do you think that SD is less _predictably_ fair or unfair than mainline? This is what I consider to be very important; if my retarted kid brother sometimes walk left and sometimes right when I tell him to walk forward, I can't go stand to the right and say "nono, forward I said". If on the right there's a highway, you can imagine what that means... All software is stupid, but the one that's predictably so allows you to compensate. > this "provide fairness" goal is quite important, because if SD's code > is not only about providing fairness, what is the rest of the logic > doing? Are they "tweaks", to achieve interactivity? If yes, why are > they not marked as such? I.e. will we go down the _same_ road again, > but this time with a much less clearly defined rule for what a > "tweak" is? One answer to that is that it's much less important what a tweak is as long as it's the same always. If I then don't like the definition I'll just define it the other way around privately and be done with it. I do believe that SDs objective is not fairness as such, it's predictability. Being "fair" was postulated as a condition for being so, but let's not put too much focus on that one point; it's a matter of definitions (and taste) and secondary. > So _if_ we accept that scheduling must include a fair dose of > heuristics (which i tend to think it has to), we are perhaps better > off with an interactivity design that _accepts_ this fundamental fact > and separates heuristics from core scheduling. I agree that the demands on a (one) general purpose scheduler are so diverse that it's impossible to have one that doesn't break down under some set of conditions. The mainline scheduler does so, and SD does so. What SD does is take some of the guesswork out of it. 
I haven't needed anything like it yet, but I wouldn't feel particularly bad about, say, renicing a kernel compile upon having audio stutter while I'm browsing eBay. The "I haven't needed anything like it" is important; I ofcourse only wouldn't mind it under the condition that what I consider loads that my desktop should be able to handle without problem don't need anything special. If I'd transpose this load onto the Pentium 1 that's sitting at my feet, I wouldn't mind at all though. > Right now i dont see the SD proponents even _accepting_ that even the > current SD code does include heuristics. > > the other one is: > > - the code has to demonstrate that it can flexibly react to various > complaints of regressions. With one important point -- if every single _change_ in behaviour is going to be defined a regression, then obviously noone will ever again be able to change anything fundamental. Behaviour is being changed since people see current behaviour as not being desireable. Predictability for one is in my opinion a strong enough "progression" that I'm willing to mark of a few "regressions" against it. > (I identified a few problem workloads that we tend to care about and > i havent seen much progress with them - but i really reserve > judgement about that, given Con's medical condition.) Indeed. Going forward with it while its main developer is out might be unwise ofcourse. From his emails I gather he'll be out for some time, but hey, after kernel N+1, there'll probably be a kernel N+2... I'd just hate to see this being blocked outright. It seems to be performing so nicely for me. Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 17:05 ` Rene Herman @ 2007-04-09 17:48 ` Ingo Molnar 2007-04-09 19:09 ` Rene Herman 2007-04-09 19:56 ` Gene Heskett 0 siblings, 2 replies; 92+ messages in thread From: Ingo Molnar @ 2007-04-09 17:48 UTC (permalink / raw) To: Rene Herman Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton * Rene Herman <rene.herman@gmail.com> wrote: > > - the code actually has to match that stated goal. Right now it > > diverges from it (it is not a "fair" scheduler), and it's not > > clear why. > > I read most of the discussion centering around that specific point as > well, and frankly, I mostly came away from it thinking "so what?". > [...] it's important due to what Mike mentioned in the previous mail too: SD seems to be quite rigid in certain aspects. So if we end up with that fundamental rigidity we might as well be _very_ sure that it makes sense. Because otherwise there might be no other way out but to "revert the whole thing again". Today we always have the "tweak the interactivity estimator" route, because that code is not rigid at the core of the scheduler. > [...] one of them turn into a contrived heap of heuristics where every > progression on one front turns into a regression on another means that > one is on a dead-end road. that's not what i found when testing Mike's latest patches - they visibly improved those testcases, part of which were written to "exploit" heuristics, without regressing others. Several people reported improvements with those patches. Why was that possible without spending years on writing a new scheduler? Because the interactivity estimator is fundamentally _tweakable_. What you flag with sometimes derogative sentences as a weakness of the interactivity estimator is also its strength: tweakability is flexibility. And no, despite what you claim to be a "patchwork" it makes quite some sense: reward certain scheduling behavior and punish other type of behavior. 
That's what SD does too in the end. Sure, if your "reward" fights against the "punishment", they cancel out each other, or if the metrics used are just arbitrary and make no independent sense it's bad, but that's just plain bad engineering. Why didnt much happen in the past year or so? Frankly, due to lack of demand for change - because most people were just happy about it, or just not upset enough. And i know the types of complaints first-hand, the -rt tree is a _direct answer_ to desktop-space complaints of Linux and it includes a fair bit of scheduler changes too. Now that we have actual new testcases and people with complaints and their willingness to try patches, we can do something about it. > > the other one is: > > > > - the code has to demonstrate that it can flexibly react to various > > complaints of regressions. > > With one important point -- if every single _change_ in behaviour is > going to be defined a regression, then obviously noone will ever again > be able to change anything fundamental. [...] i didnt say that, in fact my first lkml comment about RSDL on lkml was the exact opposite, but you SD advocates are _still_ bickering about (and not accepting) fundamental things like Mike's make -j5 workload and flagging it as unrealistic, so until there's so much reality disconnect there's not much chance for this issue to progress i'm afraid. Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 17:48 ` Ingo Molnar @ 2007-04-09 19:09 ` Rene Herman 2007-04-09 19:56 ` Gene Heskett 1 sibling, 0 replies; 92+ messages in thread From: Rene Herman @ 2007-04-09 19:09 UTC (permalink / raw) To: Ingo Molnar Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton On 04/09/2007 07:48 PM, Ingo Molnar wrote: > i didnt say that, in fact my first lkml comment about RSDL on lkml > was the exact opposite, but you SD advocates are _still_ bickering > about (and not accepting) fundamental things like Mike's make -j5 > workload and flagging it as unrealistic, so until there's so much > reality disconnect there's not much chance for this issue to progress > i'm afraid. I suppose I'm lumped in with the "SD advocates" now but you will note that I haven't been bickering about make -j5 loads. You cut away the entire meat of my reply which was all that predictability harping. What I did say about make -j5 loads is that I do not think that they, under all circumstances, on all machines and at all cost, need to perform the same as currently if other situations improve. Do I want heuristics? Sure, I'm just saying the kernel is fundamentally incapable of getting it right all of the time and as such it should provide me with as many opportunities as possible at stepping in. That is, let me understand what it is and is going to be doing and then listen to me. I agree not a lot of progress is to be made if people keep ignoring each other like that but also while SD's author is offline. Let's just shelve it until he's back. Not bury though... Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 17:48 ` Ingo Molnar 2007-04-09 19:09 ` Rene Herman @ 2007-04-09 19:56 ` Gene Heskett 1 sibling, 0 replies; 92+ messages in thread From: Gene Heskett @ 2007-04-09 19:56 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, Rene Herman, Mike Galbraith, Con Kolivas, Andrew Morton On Monday 09 April 2007, Ingo Molnar wrote:[...] > >i didnt say that, in fact my first lkml comment about RSDL on lkml was >the exact opposite, but you SD advocates are _still_ bickering about >(and not accepting) fundamental things like Mike's make -j5 workload and >flagging it as unrealistic, so until there's so much reality disconnect >there's not much chance for this issue to progress i'm afraid. Mikes -j5 workload is AFAIAC, a very realistic workload for building a kernel. My own script I just discovered was using -j8, and that was noticeable, but by no means a killing hit on my poor old Xp2800 Athlon. I pulled it back to 4 for this mornings build and the hit, while less, is still noticeable. Killer hit? No way. Using Mike's v4 patch I think it was called. > Ingo -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) When you jump for joy, beware that no-one moves the ground from beneath your feet. -- Stanislaw Lem, "Unkempt Thoughts" ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 12:14 ` Rene Herman 2007-04-09 13:27 ` Andreas Mohr 2007-04-09 14:15 ` Ingo Molnar @ 2007-04-09 17:10 ` Mike Galbraith 2 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 17:10 UTC (permalink / raw) To: Rene Herman Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On Mon, 2007-04-09 at 14:14 +0200, Rene Herman wrote: > This turned into an interactivity thing, and while interactivity is in > fact better for a large majority of testers, that isn't what Kolivas' > scheduler is about. It's about predictability and leaving the dead-end > road of these endlesss tweaks, which then break previous tweaks, rinse, > repeat. To me, it's more than an interactivity thing. It is also about reacting to a dynamic environment, which the desktop is. SD is not dynamic. > It's unfortunate that Kolivas is having health problems currently, but I > certainly do hope that his scheduler finds its way into _a_ -rc1. He > said it was done... Well, there I disagree with him quite strongly, but it's not my decision what gets integrated into any tree but my own ;-) -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 18:51 ` Rene Herman 2007-04-09 4:23 ` Mike Galbraith @ 2007-04-09 13:53 ` Ingo Molnar 2007-04-09 15:37 ` Rene Herman 1 sibling, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-09 13:53 UTC (permalink / raw) To: Rene Herman Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list [-- Attachment #1: Type: text/plain, Size: 2249 bytes --] * Rene Herman <rene.herman@gmail.com> wrote: > > and that change from Mike responded to a testcase. Mike's latest > > changes (the ones you just tested) were mostly driven by actual > > testcases too, which measured long-term timeslice distribution > > fairness. > > Ah yes, that one. Here's the next one in that series: > > commit f1adad78dd2fc8edaa513e0bde92b4c64340245c > Author: Linus Torvalds <torvalds@g5.osdl.org> > Date: Sun May 21 18:54:09 2006 -0700 > > Revert "[PATCH] sched: fix interactive task starvation" yes - in hindsight i regret having asked Mike for a "simpler" patch, which turned out to be rushed and plain broke your setup: my bad. And i completely forgot about that episode, Mike did a stream of changes in that timeframe and this one was indeed reverted :-/ > It personally had me wonder if _anyone_ was testing this stuff... yes, i certainly tried it and it broke nothing, and it was in fact acked by Con too: > Signed-off-by: Mike Galbraith <efault@gmx.de> > Acked-by: Ingo Molnar <mingo@elte.hu> > Cc: Nick Piggin <nickpiggin@yahoo.com.au> > Acked-by: Con Kolivas <kernel@kolivas.org> > Signed-off-by: Andrew Morton <akpm@osdl.org> > Signed-off-by: Linus Torvalds <torvalds@osdl.org> but it broke your setup: > This reverts commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 (and > its dependent commit 8a5bc075b8d8cf7a87b3f08fad2fba0f5d13295e), > because of audio underruns. 
> > Reported by Rene Herman <rene.herman@keyaccess.nl>, who also > pinpointed the exact cause of the underruns: > > "Audio underruns galore, with only ogg123 and firefox (browsing > the GIT tree online is also a nice trigger by the way). > > If I back it out, everything is fine for me again." so reverting it was justified. Basically, the approach was that the vanilla scheduler is working reasonably well, and that any improvement to it must not cause regression in areas where it already works well. (it obviously must have been working on your audio setup to a certain degree if reverting Mike's patch made the underruns go away) In any case, it would be very nice if you could try Mike's latest patch, how does it work on your setup? (i've attached it) Ingo [-- Attachment #2: sched-mike-4.patch --] [-- Type: text/plain, Size: 31773 bytes --] On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote: > looks interesting - could you send the patch? Ok, this is looking/feeling pretty good in testing. Comments on fugliness etc much appreciated. Below the numbers is a snapshot of my experimental tree. It's a mixture of my old throttling/anti-starvation tree and the task promotion patch, with the addition of a scheduling class for interactive tasks to dish out some of that targeted unfairness I mentioned. SCHED_INTERACTIVE is also targeted at the scenario where X or one of it's clients uses enough CPU to end up in the expired array. 
(note: Xorg was not set SCHED_INTERACTIVE during the test runs below) -Mike top - 12:31:34 up 16 min, 13 users, load average: 7.37, 8.74, 6.58 PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P COMMAND 6542 root 15 0 1568 108 24 S 43 0.0 0:58.98 1 fiftypercent 6540 root 17 0 1568 440 356 R 30 0.0 1:00.04 0 fiftypercent 6544 root 18 0 1568 108 24 R 28 0.0 0:58.36 0 fiftypercent 6541 root 20 0 1568 108 24 R 26 0.0 0:57.70 1 fiftypercent 6536 root 25 0 1436 356 296 R 24 0.0 0:45.76 1 chew 6538 root 25 0 1436 356 296 R 20 0.0 0:49.73 0 chew 6543 root 19 0 1568 108 24 R 19 0.0 0:58.04 1 fiftypercent 6409 root 15 0 154m 63m 27m R 2 6.3 0:13.09 0 amarokapp 6410 root 15 0 154m 63m 27m S 2 6.3 0:14.36 0 amarokapp 6376 root 15 0 2380 1092 764 R 2 0.1 0:15.63 0 top 5591 root 18 0 4736 1036 736 S 1 0.1 0:00.14 1 smpppd 5678 root 15 0 167m 24m 4848 S 1 2.4 0:19.37 0 Xorg 6202 root 15 0 32364 18m 12m S 1 1.8 0:04.25 1 konsole 50 lines from center of chew nailed to cpu0's log pid 6538, prio 0, out for 27 ms, ran for 1 ms, load 6% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 27 ms, ran for 7 ms, load 20% pid 6538, prio 0, out for 13 ms, ran for 5 ms, load 27% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 49% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 6 ms, load 42% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 48% pid 6538, prio 0, out for 9 ms, ran for 27 ms, load 74% pid 6538, prio 0, out for 27 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 26 ms, ran for 5 ms, load 17% pid 6538, prio 0, out for 27 ms, ran for 5 ms, load 17% pid 6538, prio 0, out for 28 ms, ran for 6 ms, load 18% pid 6538, prio 0, out for 30 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 18 ms, ran for 5 ms, load 24% pid 6538, prio 0, out for 
9 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 45% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 45% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 44% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 2 ms, ran for 7 ms, load 78% pid 6538, prio 0, out for 45 ms, ran for 22 ms, load 33% pid 6538, prio 0, out for 31 ms, ran for 2 ms, load 7% pid 6538, prio 0, out for 62 ms, ran for 1 ms, load 3% pid 6538, prio 0, out for 29 ms, ran for 3 ms, load 11% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 134 ms, ran for 5 ms, load 4% pid 6538, prio 0, out for 78 ms, ran for 2 ms, load 3% pid 6538, prio 0, out for 9 ms, ran for 3 ms, load 28% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 48% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 10 ms, ran for 7 ms, load 43% pid 6538, prio 0, out for 9 ms, ran for 6 ms, load 39% pid 6538, prio 0, out for 9 ms, ran for 7 ms, load 42% pid 6538, prio 0, out for 8 ms, ran for 7 ms, load 46% pid 6538, prio 0, out for 14 ms, ran for 6 ms, load 30% pid 6538, prio 0, out for 27 ms, ran for 3 ms, load 12% pid 6538, prio 0, out for 29 ms, ran for 4 ms, load 12% pid 6538, prio 0, out for 29 ms, ran for 4 ms, load 13% pid 6538, prio 0, out for 26 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 29 ms, ran for 5 ms, load 14% pid 6538, prio 0, out for 27 ms, ran for 4 ms, load 14% pid 6538, prio 0, out for 26 ms, ran for 5 ms, load 16% pid 6538, prio 0, out for 24 ms, ran for 6 ms, load 20% pid 6538, prio 0, out for 7 ms, ran for 7 ms, load 49% root@Homer: ./massive_intr 30 180 006502 00002373 006495 00002687 006518 00002417 006490 00002544 006500 00002417 006494 00002427 006498 00003032 006517 00003060 006505 00002401 006507 00002375 
006514 00002398 006497 00002483 006506 00002388 006504 00002415 006510 00002472 006516 00002365 006509 00002441 006503 00002498 006512 00002930 006496 00002565 006492 00002389 006501 00002337 006508 00002395 006491 00002486 006499 00002394 006493 00002667 006515 00002569 006511 00002555 006513 00002637 006519 00002556 --- include/linux/sched.h | 7 include/linux/sysctl.h | 2 kernel/sched.c | 450 +++++++++++++++++++++++++++++++++++++++++++++---- kernel/sysctl.c | 39 +++- 4 files changed, 459 insertions(+), 39 deletions(-) Index: linux/include/linux/sched.h =================================================================== --- linux.orig/include/linux/sched.h +++ linux/include/linux/sched.h @@ -34,6 +34,7 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_INTERACTIVE 4 #ifdef __KERNEL__ @@ -528,7 +529,7 @@ struct signal_struct { #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) +#define is_rt_policy(p) ((p) == SCHED_RR || (p) == SCHED_FIFO) #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) /* @@ -820,14 +821,14 @@ struct task_struct { #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; + unsigned long sleep_avg, last_slice, throttle; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ enum sleep_type sleep_type; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int time_slice, slice_info; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; Index: linux/include/linux/sysctl.h =================================================================== --- linux.orig/include/linux/sysctl.h +++ linux/include/linux/sysctl.h @@ -165,6 +165,8 @@ enum KERN_MAX_LOCK_DEPTH=74, 
KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_SCHED_THROTTLE1=77, /* int: throttling credit period 1 in secs */ + KERN_SCHED_THROTTLE2=78, /* int: throttling credit period 2 in secs */ }; Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -90,6 +90,20 @@ unsigned long long __attribute__((weak)) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -109,6 +123,8 @@ unsigned long long __attribute__((weak)) #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define PCNT_PER_DYNPRIO (100 / MAX_BONUS) +#define INTERACTIVE_LIMIT (DEF_TIMESLICE * 4) /* * If a task is 'interactive' then we reinsert it in the active @@ -167,6 +183,133 @@ unsigned long long __attribute__((weak)) (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +#define INTERACTIVE_LIMIT_EXCEEDED(rq) \ + ((rq)->active->interactive_ticks + (rq)->expired->interactive_ticks > \ + INTERACTIVE_LIMIT) + +/* + * Interactive boost can lead to starvation if the decision to + * boost a task turns out to be a bad one. 
To combat this, we + * compute the sane upper limit for cpu usage 'slice_avg' based + * upon a task's sleep_avg, and use this information combined + * with a timer to determine when intervention is required. + * + * When a task is behaving as it's sleep_avg indicates it should, + * it's throttle is moved forward, otherwise it will timeout, and + * it's priority will be lowered. + * + * Throttling tunables. + * + * CREDIT_C1: The amount of cpu time in seconds that a new task + * will run completely free, ie the head start a task + * has before it has to push it's timer forward to avoid + * being throttled. Each conforming slice thereafter + * increases it's stored credit, and vice versa. + * + * CREDIT_C2: The maximum amount of CPU time in seconds a task + * can store for later use. When a task has no stored + * credit left, now is time C2. Tasks begin life with + * C1 seconds credit, ie C2 is C1 seconds in front of + * them, and the 'buffer' will grow in front of them + * if they perform in a conformant manner. The maximum + * credit that fits in 32 bits jiffies is 42949 seconds. + */ + +int credit_c1 = 0; +int credit_c2 = 14400; +int credit_max = 42949; + +#define C1 (credit_c1 * MAX_BONUS * HZ) +#define C2 (credit_c2 * MAX_BONUS * HZ + C1) +#define C3 (MAX_BONUS * C2) + +#define credit_exhausted(p, credit) \ + (time_after_eq(jiffies, (p)->throttle + (credit))) + +/* + * Masks for p->slice_info, formerly p->first_time_slice. + * SLICE_FTS: 0x80000000 Task is in it's first ever timeslice. + * SLICE_NEW: 0x40000000 Slice refreshed. + * SLICE_INT: 0x20000000 Task is a SCHED_INTERACTIVE task partner. + * SLICE_SPA: 0x1FFE0000 Spare bits. + * SLICE_LTS: 0x0001FF80 Last time slice + * SLICE_AVG: 0x0000007F Task slice_avg stored as percentage. 
+ */ +#define SLICE_AVG_BITS 7 +#define SLICE_LTS_BITS 10 +#define SLICE_SPA_BITS 12 +#define SLICE_INT_BITS 1 +#define SLICE_NEW_BITS 1 +#define SLICE_FTS_BITS 1 + +#define SLICE_AVG_SHIFT 0 +#define SLICE_LTS_SHIFT (SLICE_AVG_SHIFT + SLICE_AVG_BITS) +#define SLICE_SPA_SHIFT (SLICE_LTS_SHIFT + SLICE_LTS_BITS) +#define SLICE_INT_SHIFT (SLICE_SPA_SHIFT + SLICE_SPA_BITS) +#define SLICE_NEW_SHIFT (SLICE_INT_SHIFT + SLICE_INT_BITS) +#define SLICE_FTS_SHIFT (SLICE_NEW_SHIFT + SLICE_NEW_BITS) + +#define INFO_MASK(x) ((1U << (x))-1) +#define SLICE_AVG_MASK (INFO_MASK(SLICE_AVG_BITS) << SLICE_AVG_SHIFT) +#define SLICE_LTS_MASK (INFO_MASK(SLICE_LTS_BITS) << SLICE_LTS_SHIFT) +#define SLICE_SPA_MASK (INFO_MASK(SLICE_SPA_BITS) << SLICE_SPA_SHIFT) +#define SLICE_INT_MASK (INFO_MASK(SLICE_INT_BITS) << SLICE_INT_SHIFT) +#define SLICE_NEW_MASK (INFO_MASK(SLICE_NEW_BITS) << SLICE_NEW_SHIFT) +#define SLICE_FTS_MASK (INFO_MASK(SLICE_FTS_BITS) << SLICE_FTS_SHIFT) + +/* p->slice_info access macros. */ +#define first_time_slice(p) ((p)->slice_info & SLICE_FTS_MASK) +#define set_first_time_slice(p) ((p)->slice_info |= SLICE_FTS_MASK) +#define clr_first_time_slice(p) ((p)->slice_info &= ~SLICE_FTS_MASK) + +#define slice_is_new(p) ((p)->slice_info & SLICE_NEW_MASK) +#define set_slice_is_new(p) ((p)->slice_info |= SLICE_NEW_MASK) +#define clr_slice_is_new(p) ((p)->slice_info &= ~SLICE_NEW_MASK) + +#define task_is_interactive(p) ((p)->slice_info & SLICE_INT_MASK) +#define set_task_is_interactive(p) ((p)->slice_info |= SLICE_INT_MASK) +#define clr_task_is_interactive(p) ((p)->slice_info &= ~SLICE_INT_MASK) + +#define last_slice(p) (((p)->slice_info & SLICE_LTS_MASK) >> SLICE_LTS_SHIFT) +#define set_last_slice(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_LTS_MASK) | (((n) << SLICE_LTS_SHIFT) & SLICE_LTS_MASK))) + +#define NS_SLEEP_AVG_PCNT (NS_MAX_SLEEP_AVG / 100) + +/* Note: raw storage format of slice_avg is %cpu. 
*/ +#define slice_avg(p) ((typeof((p)->sleep_avg)) \ + ((((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) * \ + NS_SLEEP_AVG_PCNT)) +#define set_slice_avg(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_AVG_MASK) | ((((n) / NS_SLEEP_AVG_PCNT) \ + << SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) +#define slice_avg_raw(p) \ + (((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) +#define set_slice_avg_raw(p, n) ((p)->slice_info = (((p)->slice_info & \ + ~SLICE_AVG_MASK) | (((n) << SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) + +/* cpu usage macros. */ +#define cpu_avg(p) \ + (100 - slice_avg_raw(p)) + +#define cpu_max(p) \ + (100 - ((p)->sleep_avg / NS_SLEEP_AVG_PCNT)) + +#define time_this_slice(p) \ + (jiffies - (p)->last_slice) + +#define cpu_this_slice(p) \ + (100 * last_slice(p) / max((unsigned) time_this_slice(p), \ + (unsigned) last_slice(p))) + +#define cpu_avg_rq(rq) \ + (100 * DEF_TIMESLICE / max((unsigned) (rq)->slice_avg, \ + (unsigned) DEF_TIMESLICE)) + +/* Positively identified interactive tasks. 
*/ +#define task_interactive(p) \ + ((p)->policy == SCHED_INTERACTIVE || task_is_interactive(p)) + #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) @@ -201,6 +344,7 @@ static inline unsigned int task_timeslic struct prio_array { unsigned int nr_active; + int interactive_ticks; DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ struct list_head queue[MAX_PRIO]; }; @@ -234,7 +378,8 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; + unsigned long switch_timestamp; + unsigned long slice_avg; /* Cached timestamp set by update_cpu_clock() */ unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; @@ -691,6 +836,8 @@ static void dequeue_task(struct task_str list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + if (TASK_INTERACTIVE(p)) + array->interactive_ticks -= p->time_slice; } static void enqueue_task(struct task_struct *p, struct prio_array *array) @@ -700,6 +847,8 @@ static void enqueue_task(struct task_str __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + if (TASK_INTERACTIVE(p)) + array->interactive_ticks += p->time_slice; } /* @@ -882,7 +1031,11 @@ static int recalc_task_prio(struct task_ /* Caller must always ensure 'now >= p->timestamp' */ unsigned long sleep_time = now - p->timestamp; - if (batch_task(p)) + /* + * Migration timestamp adjustment may induce negative time. + * Ignore unquantifiable values as well as SCHED_BATCH tasks. + */ + if (now < p->timestamp || batch_task(p)) sleep_time = 0; if (likely(sleep_time > 0)) { @@ -893,7 +1046,14 @@ static int recalc_task_prio(struct task_ */ unsigned long ceiling = INTERACTIVE_SLEEP(p); - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Update throttle position. 
+ */ + p->throttle += NS64_TO_JIFFIES(sleep_time); + if (time_before(jiffies, p->throttle)) + p->throttle = jiffies; + + if (sleep_time > ceiling && p->sleep_avg < ceiling) { /* * Prevents user tasks from achieving best priority * with one single large enough sleep. @@ -915,7 +1075,7 @@ static int recalc_task_prio(struct task_ * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { + if (p->sleep_type == SLEEP_NONINTERACTIVE) { if (p->sleep_avg >= ceiling) sleep_time = 0; else if (p->sleep_avg + sleep_time >= @@ -1531,16 +1691,23 @@ out_activate: * sleep_avg beyond just interactive state. */ p->sleep_type = SLEEP_NONINTERACTIVE; - } else + } else if (task_interactive(current)) { + /* + * Tasks tagged as being truly interactive + * pass temporary interactive status on to + * the task they are waking. + */ + set_task_is_interactive(p); + p->sleep_type = SLEEP_INTERACTIVE; + } /* * Tasks that have marked their sleep as noninteractive get * woken up with their sleep average not weighted in an * interactive way. */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - + else if (old_state & TASK_NONINTERACTIVE) + p->sleep_type = SLEEP_NONINTERACTIVE; activate_task(p, rq, cpu == this_cpu); /* @@ -1628,9 +1795,24 @@ void fastcall sched_fork(struct task_str * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ - p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); + + /* + * Set up slice_info and initial throttle position for the child. + */ + set_slice_avg(p, p->sleep_avg); + set_last_slice(p, p->time_slice); + set_slice_is_new(p); + set_first_time_slice(p); + p->last_slice = jiffies; + p->throttle = jiffies - C2 + C1; + /* + * SCHED_INTERACTIVE policy cannot be inherited. 
+ */ + if (unlikely(current->policy == SCHED_INTERACTIVE)) + p->policy = SCHED_NORMAL; + if (unlikely(!current->time_slice)) { /* * This case is rare, it happens when the parent has only @@ -1745,7 +1927,7 @@ void fastcall sched_exit(struct task_str * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + if (first_time_slice(p) && task_cpu(p) == task_cpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -3051,9 +3233,10 @@ static inline int expired_starving(struc { if (rq->curr->static_prio > rq->best_expired_prio) return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) + if (!STARVATION_LIMIT) return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) + if (jiffies - rq->switch_timestamp > rq->nr_running * DEF_TIMESLICE + + STARVATION_LIMIT) return 1; return 0; } @@ -3131,8 +3314,165 @@ void account_steal_time(struct task_stru cpustat->steal = cputime64_add(cpustat->steal, tmp); } +/* + * Promote and requeue the next lower priority task. If no task + * is available in the active array, switch to the expired array. + * @rq: runqueue to search. + * @prio: priority at which to begin search. 
+ */ +static inline void promote_next_lower(struct rq *rq, int prio) +{ + struct prio_array *array = rq->active; + struct task_struct *p = NULL; + unsigned long long now = rq->most_recent_timestamp; + unsigned long *bitmap; + unsigned long starving = JIFFIES_TO_NS(rq->slice_avg); + int idx = prio + 1, found_noninteractive = 0; + int ticks = rq->active->interactive_ticks + rq->expired->interactive_ticks; + +repeat: + bitmap = array->bitmap; + idx = find_next_bit(bitmap, MAX_PRIO, idx); + if (idx < MAX_PRIO) { + struct list_head *queue = array->queue + idx; + + p = list_entry(queue->next, struct task_struct, run_list); + if (!TASK_INTERACTIVE(p)) + found_noninteractive = 1; + + /* Skip non-starved queues. */ + if (now < p->last_ran + starving) { + idx++; + p = NULL; + goto repeat; + } + } else if (!found_noninteractive && array == rq->active) { + /* Nobody home, check the expired array. */ + array = rq->expired; + idx = prio; + p = NULL; + goto repeat; + } + + /* Found one, requeue it. */ + if (p) { + dequeue_task(p, p->array); + if (array == rq->active) + p->prio--; + /* + * If we pulled a task from the expired array, correct + * expired array info. We can't afford a full search + * for best_expired_prio, but do the best we can. + */ + else { + idx = sched_find_first_bit(array->bitmap); + if (idx < MAX_PRIO) { + if (rq->best_expired_prio > idx) + rq->best_expired_prio = idx; + } else { + /* We emptied the array */ + rq->best_expired_prio = MAX_PRIO; + /* + * If we have excessive interactive load, + * do not inhibit forced array switching. + */ + if (ticks < INTERACTIVE_LIMIT) + rq->switch_timestamp = jiffies; + } + } + enqueue_task(p, rq->active); + } +} + +/* + * Refresh timeslice and associated slice information. + * @p: the process to refresh. 
+ */ +static void refresh_timeslice(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + unsigned long slice_time = jiffies - p->last_slice; + int idle, cpu, cpu_avg, slice = last_slice(p); + int w = MAX_BONUS, delta, bonus; + + if (unlikely(slice_time < slice)) + slice_time = slice; + + /* Update task's CPU usage. */ + cpu_avg = slice_avg_raw(p); + cpu = cpu_this_slice(p); + idle = 100 - cpu; + delta = max(cpu_avg, idle) - min(cpu_avg, idle); + w = 1 + (delta / w); + cpu_avg = (w * cpu_avg + idle) / (w + 1); + set_slice_avg_raw(p, cpu_avg); + + /* + * If we've hit the throttle timeout, we aren't draining enough + * sleep_avg to keep up with the task's cpu usage. Up the ante + * to bring the task back toward balance. + */ + if (credit_exhausted(p, C2) && p->sleep_avg > slice_avg(p)) { + unsigned long run_time = p->sleep_avg - slice_avg(p); + run_time /= w; + if (p->sleep_avg >= run_time) + p->sleep_avg -= run_time; + } + + /* + * Update throttle position and sanity check it. + */ + if (task_is_interactive(p)) + p->throttle += slice_time - slice; + else if (INTERACTIVE_LIMIT_EXCEEDED(rq) && + cpu_avg - cpu_avg_rq(rq) >= PCNT_PER_DYNPRIO) { + bonus = (cpu_avg - cpu_avg_rq(rq)) / PCNT_PER_DYNPRIO; + p->throttle -= slice_time * bonus; + } else if (cpu < cpu_max(p) + PCNT_PER_DYNPRIO) { + bonus = idle * PCNT_PER_DYNPRIO / 100; + p->throttle += (slice_time - slice) * bonus; + } else if (cpu >= cpu_max(p) + PCNT_PER_DYNPRIO) { + bonus = (cpu - cpu_max(p)) / PCNT_PER_DYNPRIO; + p->throttle -= slice_time * bonus; + } + + if (time_before(jiffies, p->throttle)) + p->throttle = jiffies; + else if (credit_exhausted(p, C3)) + p->throttle = jiffies - C3; + + /* Add our slice time to the runqueue average. 
*/ + if (slice_time < HZ || slice_time < rq->nr_running * DEF_TIMESLICE) { + rq->slice_avg <<= 4; + rq->slice_avg += slice_time; + rq->slice_avg >>= 4; + } + + /* + * Ensure that SCHED_INTERACTIVE tasks and their partners will + * always be classified correctly by TASK_INTERACTIVE(). Clear + * propogated interactive task status. Propogated status is + * inherited from the parent, but is good for only one slice. + */ + if (task_is_interactive(p) && p->sleep_avg < INTERACTIVE_SLEEP(p)) + p->sleep_avg = INTERACTIVE_SLEEP(p); + clr_task_is_interactive(p); + + /* Update dynamic priority and time slice. */ + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + set_last_slice(p, p->time_slice); + + /* And finally, stamp and flag the new slice. */ + clr_first_time_slice(p); + set_slice_is_new(p); + p->last_slice = jiffies; +} + static void task_running_tick(struct rq *rq, struct task_struct *p) { + int task_was_interactive; + if (p->array != rq->active) { /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); @@ -3152,8 +3492,7 @@ static void task_running_tick(struct rq * FIFO tasks have no timeslices. */ if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + refresh_timeslice(p); set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -3161,21 +3500,36 @@ static void task_running_tick(struct rq } goto out_unlock; } + + /* + * Tick off interactive task ticks from the active array. 
+ */ + task_was_interactive = TASK_INTERACTIVE(p); + if (task_was_interactive && --rq->active->interactive_ticks < 0) + rq->active->interactive_ticks = 0; + if (!--p->time_slice) { dequeue_task(p, rq->active); + refresh_timeslice(p); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + + if (!TASK_INTERACTIVE(p) || expired_starving(rq) || + credit_exhausted(p, C2)) { enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); + + /* + * Always look to see if any queue under you is starving, + * and requeue a task if that is the case. This prevents + * things like multiple tasks at any priority waking in + * streams and starving their less fortunate peers via + * preempt, ie ensures that the less fortunate will have + * bounded latency. + */ + promote_next_lower(rq, p->prio); } else { /* * Prevent a too long timeslice allowing a task to monopolize @@ -3285,7 +3639,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx, new_prio; + int cpu, idx, new_prio, throttle; long *switch_count; struct rq *rq; @@ -3332,9 +3686,13 @@ need_resched_nonpreemptible: /* * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); + * delay them losing their interactive status. If we have too many + * interactive ticks queued or this task is being throttled, switch + * behavior to linear decay. 
+ */ + throttle = INTERACTIVE_LIMIT_EXCEEDED(rq) || credit_exhausted(prev, C2); + if (!throttle) + run_time /= 1 + CURRENT_BONUS(prev); spin_lock_irq(&rq->lock); @@ -3356,7 +3714,7 @@ need_resched_nonpreemptible: idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; + rq->switch_timestamp = jiffies; goto switch_tasks; } } @@ -3370,7 +3728,8 @@ need_resched_nonpreemptible: rq->active = rq->expired; rq->expired = array; array = rq->active; - rq->expired_timestamp = 0; + array->interactive_ticks = 0; + rq->switch_timestamp = jiffies; rq->best_expired_prio = MAX_PRIO; } @@ -3380,6 +3739,8 @@ need_resched_nonpreemptible: if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; + int next_interactive = TASK_INTERACTIVE(next); + if (unlikely((long long)(now - next->timestamp) < 0)) delta = 0; @@ -3389,14 +3750,33 @@ need_resched_nonpreemptible: array = next->array; new_prio = recalc_task_prio(next, next->timestamp + delta); + /* + * If INTERACTIVE_LIMIT is exceeded, do not promote + * tasks which already have interactive status. This + * can only make things worse if the load isn't truly + * interactive, so let them decay. We also don't want + * a task which has been promoted while waiting to + * get CPU after wakeup to be demoted, and thus end + * up being preempted immediately by a task waking + * at the priority it has just reached. Tasks which + * miss the tick frequently also get caught here, so + * care has to be taken to not help them along. Since + * these are very likely to have interactive status, + * don't ever demote a non-interactive task here, and + * always considered interactive tasks to be fair game. 
+ */ + if ((throttle && next_interactive && new_prio < next->prio) || + (!next_interactive && new_prio > next->prio)) + goto switch_tasks; + if (unlikely(next->prio != new_prio)) { dequeue_task(next, array); next->prio = new_prio; enqueue_task(next, array); } } - next->sleep_type = SLEEP_NORMAL; switch_tasks: + next->sleep_type = SLEEP_NORMAL; if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); @@ -3411,6 +3791,14 @@ switch_tasks: prev->sleep_avg = 0; prev->timestamp = prev->last_ran = now; + /* + * Tag start of execution of a new timeslice. + */ + if (unlikely(slice_is_new(next))) { + next->last_slice = jiffies; + clr_slice_is_new(next); + } + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = next->last_ran = now; @@ -4081,7 +4469,8 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_INTERACTIVE) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -4619,6 +5008,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_INTERACTIVE: ret = 0; break; } @@ -4643,6 +5033,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_INTERACTIVE: ret = 0; } return ret; @@ -6739,6 +7130,7 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; + rq->slice_avg = STARVATION_LIMIT; #ifdef CONFIG_SMP rq->sd = NULL; Index: linux/kernel/sysctl.c =================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int credit_c1; +extern int credit_c2; +extern int credit_max; /* 
this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -204,6 +207,13 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; +/* + * Constants for minimum and maximum testing in vm_table and + * kern_table. We use these as one-element integer vectors. +*/ +static int zero; +static int one_hundred = 100; + static ctl_table kern_table[] = { { .ctl_name = KERN_PANIC, @@ -611,16 +621,31 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = KERN_SCHED_THROTTLE1, + .procname = "credit_c1", + .data = &credit_c1, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &credit_max, + }, + { + .ctl_name = KERN_SCHED_THROTTLE2, + .procname = "credit_c2", + .data = &credit_c2, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &credit_max, + }, { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY, ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 13:53 ` Ingo Molnar @ 2007-04-09 15:37 ` Rene Herman 0 siblings, 0 replies; 92+ messages in thread From: Rene Herman @ 2007-04-09 15:37 UTC (permalink / raw) To: Ingo Molnar Cc: Gene Heskett, linux-kernel, Con Kolivas, Mike Galbraith, Andrew Morton, ck list On 04/09/2007 03:53 PM, Ingo Molnar wrote: > In any case, it would be very nice if you could try Mike's latest > patch, how does it work on your setup? (i've attached it) Can do. Note that "my setup" in that case consisted of browsing around eBay in firefox with ogg123 playing audio directly to ALSA in an xterm as the only other thing running. That is, just about as basic a Linux desktop as imagineable. Testing Mike's latest will have to wait a bit though; I'm currently testing the latest incarnation of SD (against 2.6.20.6). For people who've lost track of what and where, it's available as: http://ck.kolivas.org/patches/staircase-deadline/2.6.20.5-sd-0.39.patch and versus 2.6.21-rc5 as: http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc5-sd-0.39.patch For the moment it is giving me a snappy feeling desktop on this Duron 1300, with ogg123 playing in an xterm without audio underruns, with a make -j2 kernel compile running (not niced) and me browsing around in firefox. Mike latest would probably also support this load without much problem. Given that I feel the basic idea of SD is better than mainline though, I'll be concentrating on using SD for a bit for now. Rene. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 18:08 ` Ingo Molnar 2007-04-07 18:23 ` Gene Heskett @ 2007-04-07 19:14 ` Mike Galbraith 2007-04-07 20:31 ` Gene Heskett 2007-04-09 17:51 ` William Lee Irwin III 1 sibling, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-07 19:14 UTC (permalink / raw) To: Ingo Molnar Cc: Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote: > * Gene Heskett <gene.heskett@gmail.com> wrote: > > (who the hell runs a 'make -j 200' or 50 while(1)'s in the real world? > > not many - and i dont think Mike tested any of these - Mike tested > pretty low make -j values (Mike, can you confirm?). Yes. I don't test anything more than make -j5 when looking at interactivity, and make -j nr_cpus+1 is my must have yardstick. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 19:14 ` Mike Galbraith @ 2007-04-07 20:31 ` Gene Heskett 2007-04-09 17:51 ` William Lee Irwin III 1 sibling, 0 replies; 92+ messages in thread From: Gene Heskett @ 2007-04-07 20:31 UTC (permalink / raw) To: linux-kernel Cc: Mike Galbraith, Ingo Molnar, Con Kolivas, Andrew Morton, ck list On Saturday 07 April 2007, Mike Galbraith wrote: >On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote: >> * Gene Heskett <gene.heskett@gmail.com> wrote: >> > (who the hell runs a 'make -j 200' or 50 while(1)'s in the real >> > world? >> >> not many - and i dont think Mike tested any of these - Mike tested >> pretty low make -j values (Mike, can you confirm?). > >Yes. I don't test anything more than make -j5 when looking at >interactivity, and make -j nr_cpus+1 is my must have yardstick. > > -Mike Somebody made that remark, maybe not you, and maybe they were being funny, but I didn't at the time, see any smileys. -- Cheers, Gene "There are four boxes to be used in defense of liberty: soap, ballot, jury, and ammo. Please use in that order." -Ed Howdershelt (Author) Please remain calm, it's no use both of us being hysterical at the same time. ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 19:14 ` Mike Galbraith 2007-04-07 20:31 ` Gene Heskett @ 2007-04-09 17:51 ` William Lee Irwin III 2007-04-09 18:03 ` Ingo Molnar 1 sibling, 1 reply; 92+ messages in thread From: William Lee Irwin III @ 2007-04-09 17:51 UTC (permalink / raw) To: Mike Galbraith Cc: Ingo Molnar, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton, ck list On Sat, 2007-04-07 at 20:08 +0200, Ingo Molnar wrote: >> not many - and i dont think Mike tested any of these - Mike tested >> pretty low make -j values (Mike, can you confirm?). On Sat, Apr 07, 2007 at 09:14:21PM +0200, Mike Galbraith wrote: > Yes. I don't test anything more than make -j5 when looking at > interactivity, and make -j nr_cpus+1 is my must have yardstick. I strongly suggest assembling a battery of cleanly and properly written, configurable testcases, and scripting a series of regression tests as opposed to just randomly running kernel compiles and relying on Braille. For instance, a program that spawns a set of tasks with some spectrum of interactive vs. noninteractive behaviors and maybe priorities too according to command-line flags and then measures and reports the distribution of CPU bandwidth between them, with some notion of success or failure and performance within the realm of success reported would be something to include in such a battery of testcases. Different sorts of cooperating processes attempting to defeat whatever sorts of guarantees the scheduler is intended to provide would also be good testcases, particularly if they're arranged so as to automatically report success or failure in their attempts to defeat the scheduler (which even irman2.c, while quite good otherwise, fails to do). IMHO the failure of these threads to converge to some clear conclusion is in part due to the lack of an agreed-upon set of standards for what the scheduler should achieve and overreliance on subjective criteria. The testcase code going around is also somewhat embarrassing. 
>From the point of view of someone wondering what these schedulers solve, how any of this is to be demonstrated, and what the status of various pathological cases are, these threads are a nightmare of subjective squishiness and a tug-of-war between testcases only ever considered one at a time needing Lindent to read that furthermore have all their parameters hardcoded. Scripting edits and recompiles is awkward. Just finding the testcases is also awkward; con has a collection of a few, but they've got the aforementioned flaws and others also go around that can only be dredged up from mailing list archive searches, plus there's nothing like LTP where they can be run in a script with pass/fail reports and/or performance metrics for each. One patch goes through for one testcase and regressions against the others are open questions. Scheduling does have a strong subjective component, but this is too disorganized to be allowed to pass without comment. Some minimum bar must be set for schedulers to pass before they're considered correct. Some method of regression testing must be arranged. And the code to do such testing should not be complete crap with hardcoded parameters. -- wli ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 17:51 ` William Lee Irwin III @ 2007-04-09 18:03 ` Ingo Molnar 2007-04-09 18:44 ` William Lee Irwin III 0 siblings, 1 reply; 92+ messages in thread From: Ingo Molnar @ 2007-04-09 18:03 UTC (permalink / raw) To: William Lee Irwin III Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton * William Lee Irwin III <wli@holomorphy.com> wrote: > I strongly suggest assembling a battery of cleanly and properly > written, configurable testcases, and scripting a series of regression > tests as opposed to just randomly running kernel compiles and relying > on Braille. there's interbench, written by Con (with the purpose of improving RSDL/SD), which does exactly that, but vanilla and SD performs quite the same in those tests. it's quite hard to test interactivity, because it's both subjective and because even for objective workloads, things depend so much on exact circumstances. So the best way is to wait for actual complaints, and/or actual testcases that trigger badness, and victims^H^H^H^H^H testers. (also note that often it needs _that precise_ workload to trigger some badness. For example make -j depends on the kind of X shell terminal that is used - gterm behaves differently from xterm, etc.) Ingo ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 18:03 ` Ingo Molnar @ 2007-04-09 18:44 ` William Lee Irwin III 0 siblings, 0 replies; 92+ messages in thread From: William Lee Irwin III @ 2007-04-09 18:44 UTC (permalink / raw) To: Ingo Molnar Cc: Mike Galbraith, Gene Heskett, linux-kernel, Con Kolivas, Andrew Morton * William Lee Irwin III <wli@holomorphy.com> wrote: >> I strongly suggest assembling a battery of cleanly and properly >> written, configurable testcases, and scripting a series of regression >> tests as opposed to just randomly running kernel compiles and relying >> on Braille. On Mon, Apr 09, 2007 at 08:03:56PM +0200, Ingo Molnar wrote: > there's interbench, written by Con (with the purpose of improving > RSDL/SD), which does exactly that, but vanilla and SD performs quite the > same in those tests. > it's quite hard to test interactivity, because it's both subjective and > because even for objective workloads, things depend so much on exact > circumstances. So the best way is to wait for actual complaints, and/or > actual testcases that trigger badness, and victims^H^H^H^H^H testers. > (also note that often it needs _that precise_ workload to trigger some > badness. For example make -j depends on the kind of X shell terminal > that is used - gterm behaves differently from xterm, etc.) Interactivity will probably have to stay squishy. The DoS affairs like fiftyp.c, tenp.c, etc. are more of what I had in mind. There are also a number of instances where CPU bandwidth distributions are gauged by top(1) with noninteractive tests where the scriptable testcase affair should be coming into play. There are other, relatively obvious testcases for basic functionality missing, too. For instance, where is the testcase to prove that nice levels have the intended effect upon CPU bandwidth distribution between sets of CPU-bound tasks? 
Or one that gauges the CPU bandwidth distribution between a task that sleeps some (command-line configurable) percentage of the time and some (command-line configurable) number of competing CPU-bound tasks? Or one that gauges the CPU bandwidth distribution between sets of cooperating processes competing with ordinary CPU-bound processes? Can it be proven that any of this is staying constant across interactivity or other changes? Is any of it being changed as an unintended side-effect? Are the CPU bandwidth distributions among such sets of competing tasks even consciously decided? There should be readily-available answers to these questions, but they are not so. -- wli ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 6:50 ` Con Kolivas 2007-04-07 16:12 ` Gene Heskett @ 2007-04-07 16:32 ` Mike Galbraith 2007-04-08 13:08 ` Ed Tomlinson 2 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-07 16:32 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Sat, 2007-04-07 at 16:50 +1000, Con Kolivas wrote: > On Friday 06 April 2007 20:03, Ingo Molnar wrote: > > (There was one person who > > reported wide-scale interactivity regressions against mainline but he > > didnt answer my followup posts to trace/debug the scenario.) > > That was one user. As I mentioned in an earlier thread, the problem with email > threads on drawn out issues on lkml is that all that people remember is the > last one creating noise, and that has only been the noise from Mike for 2 > weeks now. This doesn't even deserve a reply, so I'll just say "get well soon". -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-07 6:50 ` Con Kolivas 2007-04-07 16:12 ` Gene Heskett 2007-04-07 16:32 ` Mike Galbraith @ 2007-04-08 13:08 ` Ed Tomlinson 2007-04-09 5:38 ` Mike Galbraith 2 siblings, 1 reply; 92+ messages in thread From: Ed Tomlinson @ 2007-04-08 13:08 UTC (permalink / raw) To: Con Kolivas Cc: Ingo Molnar, Mike Galbraith, linux list, Andrew Morton, ck list Hi, I am one of those who have been happily testing Con's patches. They work better than mainline here. There seems to be a disconnect on what Con is trying to achieve with SD. They do not improve interactivity per se. Instead they make the scheduler predictable by removing the alchemy used by the interactivity estimator. Mike's patches may be better alchemy but they continue down the same path - from prior experience, we can say with fairly good confidence, that there will be new corner cases that trigger problems. With SD, if you ask too much of the machine it slows down. You can fix this, if required, by renicing some tasks - or by reducing the load on the box. If one really needs some sort of interactivity booster (I do not with SD), why not move it into user space? With SD it would be simple enough to export some info on estimated latency. With this user space could make a good attempt to keep latency within bounds for a set of tasks just by renicing.... Thanks Ed Tomlinson PS. Get well soon Con. On Saturday 07 April 2007 02:50, Con Kolivas wrote: > On Friday 06 April 2007 20:03, Ingo Molnar wrote: > > * Con Kolivas <kernel@kolivas.org> wrote: > > > > I was more focused on the general case, but all I should have to do > > > > to de-claw all of these sleep exploits is account rr time (only a > > > > couple of lines, done and building now). It's only a couple of > > > > lines. > > > > > > The more you try to "de-claw" these sleep exploits the less effective > > > you make your precious interactive estimator. 
Feel free to keep adding > > > endless tweaks to undo the other tweaks in order to try and achieve > > > what SD has by design. > > > > firstly, testing on various workloads Mike's tweaks work pretty well, > > while SD still doesnt handle the high-load case all that well. Note that > > it was you who raised this whole issue to begin with: everything was > > pretty quiet in scheduling interactivity land. > > I'm terribly sorry but you have completely missed my intentions then. I was > _not_ trying to improve mainline's interactivity at all. My desire was to fix > the unfairness that mainline has, across the board without compromising > fairness. You said yourself that an approach that fixed a lot and had a small > number of regressions would be worth it. In a surprisingly ironic turnaround > two bizarre things happened. People found SD fixed a lot of their > interactivity corner cases which were showstoppers. That didn't surprise me > because any unfair design will by its nature get it wrong sometimes. The even > _more_ surprising thing is that you're now using interactivity as the > argument against SD. I did not set out to create better interactivity, I set > out to create widespread fairness without too much compromise to > interactivity. As I said from the _very first email_, there would be cases of > interactivity in mainline that performed better. > > > (There was one person who > > reported wide-scale interactivity regressions against mainline but he > > didnt answer my followup posts to trace/debug the scenario.) > > That was one user. As I mentioned in an earlier thread, the problem with email > threads on drawn out issues on lkml is that all that people remember is the > last one creating noise, and that has only been the noise from Mike for 2 > weeks now. Has everyone forgotten the many many users who reported the > advantages first up which generated the interest in the first place? Why have > they stopped reporting? 
Well the answer is obvious; all the signs suggest > that SD is slated for mainline. It is on the path, Linus has suggested it and > now akpm is asking if it's ready for 2.6.22. So they figure there is no point > testing and replying any further. SD is ready for prime time, finalised and > does everything I intended it to. This is where I have to reveal to them the > horrible truth. This is no guarantee it will go in. In fact, this one point > that you (Ingo) go on and on about is not only a quibble, but you will call > it an absolute showstopper. As maintainer of the cpu scheduler, in its > current form you will flatly refuse it goes to mainline citing the 5% of > cases where interactivity has regressed. So people will tell me to fix it, > right?... Read on for this to unfold. > > > SD has a built-in "interactivity estimator" as well, but hardcoded into > > its design. SD has its own set of ugly-looking tweaks as well - for > > example the prio_matrix. > > I'm sorry but this is a mis-representation to me, as I suggested on an earlier > thread where I disagree about what an interactivity estimator is. The idea of > fence posts in a clock that are passed as a way of metering out > earliest-deadline-first in a design is well established. The matrix is simply > an array designed for O(1) lookups of the fence posts. That is not the same > as "oh how much have we slept in the last $magic_number period and how much > extra time should we get for that". > > > So it all comes down on 'what interactivity > > heuristics is enough', and which one is more tweakable. So far i've yet > > to see SD address the hackbench and make -j interactivity > > problems/regression for example, while Mike has been busy addressing the > > 'exploits' reported against mainline. > > And BANG there is the bullet you will use against SD from here to eternity. SD > obeys fairness at all costs. 
Your interactivity regression is that SD causes > progressive slowdown with load which by definition is fairness. You > repeatedly ask me to address it and there is on unfailing truth; the only way > to address it is to add unfairness to the design. So why don't I? Because the > simple fact is that any unfairness no matter how carefully administered or > metered will always have cases where it's wrong. Look at the title of this > email for example - it's yet another exploit for the mainline sleep/run > mechanism. This does _not_ mean I'm implying people are logging into servers > and running ./tenp to hang the machine. What it demonstrates is a way of > reproducing the scenario which is biting people with real world loads. It's > entirely believable that a simple p2p app could be behaving like tenp, only > generating a small load and it could take ages to log in and use the console. > Willy has complained this is why people stick to 2.4. Sure I can create > interactivity tweaks worse than anyone else. I will not, though, because that > precisely undoes what is special about SD. It never looks backwards, and is > predictable to absurdity. So you'll argue that mainline can manage it > below... > > > > You'll end up with an incresingly complex state machine design of > > > interactivity tweaks and interactivity throttlers all fighting each > > > other to the point where the intearactivity estimator doesn't do > > > anything. [...] > > > > It comes down to defining interactivity by scheduling behavior, and > > making that definition flexible. SD's definition of interactivity is > > rigid (but it's still behavior-based, so not fundamentally different > > from an explicit 'interactivity estimator'), and currently it does not > > work well under high load. But ... i'm still entertaining the notion > > that it might be good enough, but you've got to demonstrate the design's > > flexibility. > > I have yet to see someone find an "exploit" for SD's current design. 
Mainline > is all about continually patching up the intrinsic design (and fixing this > one test case is not the be all and end all). > > > furthermore, your description does not match my experience when using > > Mike's tweaks and comparing it to SD on the same hardware. According to > > your claim i should have seen regressions popping up in various, > > already-fixed corners, but it didnt happen in practice. But ... i'm > > awaiting further SD and Mike tweaks, the race certainly looks > > interesting ;) > > Well you see a race. I do not. I see a flat predictable performance from SD > where there will always be slowdown with load. I have no intention of > changing that. Mike is making an admirable attempt to fix issues as they are > pointed out. You say there are no regressions but I see absolutely no testers > of his patches besides himself and you. If I introduce any unfairness based > on sleep behaviour into SD I'll be undoing the whole point of the design and > end up chasing new regressions. So I won't quibble over the numbers. SD has > produced a lot of improvements and fairness that mainline struggles with ever > increasing patches to emulate, but SD does so at the expense of proportional > slowdown with load. At least I accept that and will no longer put my health > at risk trying to "fix" it by "breaking" it. SD is done. > > I feel sorry for the many users out there who are simply "waiting for it to > end up in mainline" who just discovered you will veto it on that basis. > lwn.net had it wrong; this was far more painful than any previous attempt to > get anything into mainline. > > My health has been so badly affected by this that I've been given an ultimatum > and must turn my computer off till I get well now which may be weeks. I > already know the massive flameage and last-word comments that are likely to > be fired off before the inevitable decision to veto it. > > > Ingo > > さようなら > ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-08 13:08 ` Ed Tomlinson @ 2007-04-09 5:38 ` Mike Galbraith 2007-04-09 11:26 ` Ed Tomlinson 2007-04-10 2:39 ` Mike Galbraith 0 siblings, 2 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 5:38 UTC (permalink / raw) To: Ed Tomlinson; +Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote: > Hi, > > I am one of those who have been happily testing Con's patches. > > They work better than mainline here. (I tried a UP kernel yesterday, and even a single kernel build would make noticeable hitches if I move a window around. YMMV etc.) > If one really needs some sort of interactivity booster (I do not with SD), why > not move it into user space? With SD it would be simple enough to export > some info on estimated latency. With this user space could make a good > attempt to keep latency within bounds for a set of tasks just by renicing.... I don't think you can have very much effect on latency using nice with SD once the CPU is fully utilized. See below. /* * This contains a bitmap for each dynamic priority level with empty slots * for the valid priorities each different nice level can have. It allows * us to stagger the slots where differing priorities run in a way that * keeps latency differences between different nice levels at a minimum. * ie, where 0 means a slot for that priority, priority running from left to * right: * nice -20 0000000000000000000000000000000000000000 * nice -10 1001000100100010001001000100010010001000 * nice 0 0101010101010101010101010101010101010101 * nice 5 1101011010110101101011010110101101011011 * nice 10 0110111011011101110110111011101101110111 * nice 15 0111110111111011111101111101111110111111 * nice 19 1111111111111111111011111111111111111111 */ Nice allocates bandwidth, but as long as the CPU is busy, tasks always proceed downward in priority until they hit the expired array. That's the design. 
If X gets busy and expires, and a nice 20 CPU hog wakes up after its previous rotation has ended, but before the current rotation is ended (ie there is 1 task running at wakeup time), X will take a guaranteed minimum 160ms latency hit (quite noticeable) independent of nice level. The only way to avoid it is to use a realtime class. A nice -20 task has maximum bandwidth allocated, but that also makes it a bigger target for preemption from tasks at all nice levels as it proceeds downward toward expiration. AFAICT, low latency scheduling just isn't possible once the CPU becomes 100% utilized, but it is bounded to runqueue length. In mainline OTOH, a nice -20 task will always preempt a nice 0 task, giving it instant gratification, and latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq) safety net. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:38 ` Mike Galbraith @ 2007-04-09 11:26 ` Ed Tomlinson 2007-04-09 16:50 ` Mike Galbraith 2007-04-10 2:39 ` Mike Galbraith 1 sibling, 1 reply; 92+ messages in thread From: Ed Tomlinson @ 2007-04-09 11:26 UTC (permalink / raw) To: Mike Galbraith Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list On Monday 09 April 2007 01:38, Mike Galbraith wrote: > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote: > > Hi, > > > > I am one of those who have been happily testing Con's patches. > > > > They work better than mainline here. > > (I tried a UP kernel yesterday, and even a single kernel build would > make noticeable hitches if I move a window around. YMMV etc.) Interesting. I run UP amd64, 1000HZ, 1.25G, preempt off (on causes kernel stalls with no messages - but that is another story). I do not notice a single make. When several are running the desktop slows down a bit. I do not have X niced. Wonder why we see such different results? I am not saying that SD is perfect - I fully expect that more bugs will turn up in its code (some will affect mainline too). I do however like the idea of a scheduler that does not need alchemy to achieve good results. Nor do I necessarily expect it to be 100% transparent. If one changes something as basic as the scheduler some tweaking should be expected. IMO this is fine as long as we get consistant results. > > If one really needs some sort of interactivity booster (I do not with SD), why > > not move it into user space? With SD it would be simple enough to export > > some info on estimated latency. With this user space could make a good > > attempt to keep latency within bounds for a set of tasks just by renicing.... > > I don't think you can have very much effect on latency using nice with > SD once the CPU is fully utilized. See below. > > /* > * This contains a bitmap for each dynamic priority level with empty slots > * for the valid priorities each different nice level can have. 
It allows > * us to stagger the slots where differing priorities run in a way that > * keeps latency differences between different nice levels at a minimum. > * ie, where 0 means a slot for that priority, priority running from left to > * right: > * nice -20 0000000000000000000000000000000000000000 > * nice -10 1001000100100010001001000100010010001000 > * nice 0 0101010101010101010101010101010101010101 > * nice 5 1101011010110101101011010110101101011011 > * nice 10 0110111011011101110110111011101101110111 > * nice 15 0111110111111011111101111101111110111111 > * nice 19 1111111111111111111011111111111111111111 > */ > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always > proceed downward in priority until they hit the expired array. That's > the design. If X gets busy and expires, and a nice 20 CPU hog wakes up > after it's previous rotation has ended, but before the current rotation > is ended (ie there is 1 task running at wakeup time), X will take a > guaranteed minimum 160ms latency hit (quite noticeable) independent of > nice level. The only way to avoid it is to use a realtime class. > > A nice -20 task has maximum bandwidth allocated, but that also makes it > a bigger target for preemption from tasks at all nice levels as it > proceeds downward toward expiration. AFAIKT, low latency scheduling > just isn't possible once the CPU becomes 100% utilized, but it is > bounded to runqueue length. In mainline OTOH, a nice -20 task will > always preempt a nice 0 task, giving it instant gratification, and > latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq) > safety net. Mike I made no mention of low latency. I did mention predictable latency. If you are 100% utilized, and have a nice -20 task cpu hog, I would expect it to run and that it _should_ affect other tasks - thats why it runs with -20... This is why I suggest that user space may be a better place to boost interactive tasks. 
A daemon that posted a message telling me that the nice -20 cpu hog is causing 300ms delays for X would, IMHO, be a good thing. That same daemon could then propose a fix telling me the expected latencies and let me decide if I want to change priorities. It could also be set to automatically adjust nice levels... Thanks Ed ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 11:26 ` Ed Tomlinson @ 2007-04-09 16:50 ` Mike Galbraith 2007-04-22 10:48 ` [ck] " Martin Steigerwald 0 siblings, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-09 16:50 UTC (permalink / raw) To: Ed Tomlinson; +Cc: Con Kolivas, Ingo Molnar, linux list, Andrew Morton, ck list On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote: > On Monday 09 April 2007 01:38, Mike Galbraith wrote: > > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote: > > > Hi, > > > > > > I am one of those who have been happily testing Con's patches. > > > > > > They work better than mainline here. > > > > (I tried a UP kernel yesterday, and even a single kernel build would > > make noticeable hitches if I move a window around. YMMV etc.) > > Interesting. I run UP amd64, 1000HZ, 1.25G, preempt off (on causes kernel > stalls with no messages - but that is another story). I do not notice a single > make. When several are running the desktop slows down a bit. I do not have > X niced. Wonder why we see such different results? Probably because with your processor, in general cc1 can get the job done faster, as can X. The latency big hit happens when you hit the end of the rotation. You simply don't hit it as often as I do. Anyone with an old PIII box should hit the wall very quickly indeed. I haven't had time to try it here. > I am not saying that SD is perfect - I fully expect that more bugs will turn up > in its code (some will affect mainline too). I do however like the idea of a > scheduler that does not need alchemy to achieve good results. Nor do I > necessarily expect it to be 100% transparent. If one changes something > as basic as the scheduler some tweaking should be expected. IMO this > is fine as long as we get consistant results. Alchemy is a rather colorful word for arithmetic, but I see your point. > > > If one really needs some sort of interactivity booster (I do not with SD), why > > > not move it into user space? 
With SD it would be simple enough to export > > > some info on estimated latency. With this user space could make a good > > > attempt to keep latency within bounds for a set of tasks just by renicing.... > > > > I don't think you can have very much effect on latency using nice with > > SD once the CPU is fully utilized. See below. > > > > /* > > * This contains a bitmap for each dynamic priority level with empty slots > > * for the valid priorities each different nice level can have. It allows > > * us to stagger the slots where differing priorities run in a way that > > * keeps latency differences between different nice levels at a minimum. > > * ie, where 0 means a slot for that priority, priority running from left to > > * right: > > * nice -20 0000000000000000000000000000000000000000 > > * nice -10 1001000100100010001001000100010010001000 > > * nice 0 0101010101010101010101010101010101010101 > > * nice 5 1101011010110101101011010110101101011011 > > * nice 10 0110111011011101110110111011101101110111 > > * nice 15 0111110111111011111101111101111110111111 > > * nice 19 1111111111111111111011111111111111111111 > > */ > > > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always > > proceed downward in priority until they hit the expired array. That's > > the design. If X gets busy and expires, and a nice 20 CPU hog wakes up > > after it's previous rotation has ended, but before the current rotation > > is ended (ie there is 1 task running at wakeup time), X will take a > > guaranteed minimum 160ms latency hit (quite noticeable) independent of > > nice level. The only way to avoid it is to use a realtime class. > > > > A nice -20 task has maximum bandwidth allocated, but that also makes it > > a bigger target for preemption from tasks at all nice levels as it > > proceeds downward toward expiration. AFAIKT, low latency scheduling > > just isn't possible once the CPU becomes 100% utilized, but it is > > bounded to runqueue length. 
In mainline OTOH, a nice -20 task will > > always preempt a nice 0 task, giving it instant gratification, and > > latency of lower priority tasks is bounded by the EXPIRED_STARVING(rq) > > safety net. > > Mike I made no mention of low latency. You did say that Con's patch works better than mainline, and you seemed very much to be talking about the desktop. X very definitely is a latency sensitive application, and often a CPU hog to boot. The point I illustrated above is a salient point. If you don't want to hear about anything other than this idea about using nice from userland, skip to my last sentence :) > I did mention predictable latency. If > you are 100% utilized, and have a nice -20 task cpu hog, I would expect it to run > and that it _should_ affect other tasks - thats why it runs with -20... :-/ It does the user of X absolutely no good to be able to predict, as I did above, that we are absolutely _going_ to take a 160ms + remaining task ticks latency hit. Nice -20 was used only to show clearly what SD trades away, and it's not only the desktop it's trading for mundane latency, it's trading any possibility of low latency, and dismissing burst loads as if they don't even exist. The current scheduler is dynamic. SD is utterly rigid. Apply what I wrote to X at the recommended nice -10. It makes no difference what bandwidth you allocate if the latency sensitive application _will_ take a very major latency hit if it uses it. X does do that, so it will take those hits by design. > This is why I suggest that user space may be a better place to boost interactive > tasks. A daemon that posted a message telling me that the nice -20 cpu hog > is causing 300ms delays for X would, IMHO, be a good thing. That same daemon > could then propose a fix telling me the expected latencies and let me decide if > I want to change priorities. It could also be set to automaticily adjust nice levels... Re-read what I wrote. You simply can't get there from here, by design. 
If I'm wrong, someone please show me where. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [ck] Re: Ten percent test 2007-04-09 16:50 ` Mike Galbraith @ 2007-04-22 10:48 ` Martin Steigerwald 2007-04-22 11:15 ` Con Kolivas 0 siblings, 1 reply; 92+ messages in thread From: Martin Steigerwald @ 2007-04-22 10:48 UTC (permalink / raw) To: ck; +Cc: Mike Galbraith, Ed Tomlinson, Andrew Morton, linux list Am Montag 09 April 2007 schrieb Mike Galbraith: > On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote: > > On Monday 09 April 2007 01:38, Mike Galbraith wrote: > > > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote: > > > > Hi, > > > > > > > > I am one of those who have been happily testing Con's patches. > > > > > > > > They work better than mainline here. > > > > > > (I tried a UP kernel yesterday, and even a single kernel build > > > would make noticeable hitches if I move a window around. YMMV etc.) > > > > Interesting. I run UP amd64, 1000HZ, 1.25G, preempt off (on causes > > kernel stalls with no messages - but that is another story). I do > > not notice a single make. When several are running the desktop > > slows down a bit. I do not have X niced. Wonder why we see such > > different results? > > Probably because with your processor, in general cc1 can get the job > done faster, as can X. The latency big hit happens when you hit the > end of the rotation. You simply don't hit it as often as I do. Anyone > with an old PIII box should hit the wall very quickly indeed. I > haven't had time to try it here. Hi! I am running 2.6.20.7 + sd-0.44 on an IBM ThinkPad T23 that I use as my Amarok machine[1]. It has a Pentium 3 with 1.13 GHz using ondemand frequency scaling and XFS as filesystem. So far music playback has been perfect even when I had it building kernel packages while wildly clicking around starting apps and then moving the Amarok window like mad while solid window moving is enabled. Amarok / xine continued to play the music totally unimpressed of that. 
So for me from a user's point of view who wants good music playback *no matter what*, this is already perfect. Also the desktop feels quite snappy to me. It was only slow on anything I/O bound but that's understandable IMHO when make-kpkg tar -bzips the kernel source while 20 KDE applications are starting and Amarok plays music. Should I try any specific tests? This also goes out to anybody else, especially to you, Con. So if you want me to run some benchmarks, please tell me. I am not experienced in benchmarking, but if you tell me what to do, I can try it out. I prefer benchmarks that do not disrupt music playback, but can run more aggressive benchmarks overnight. I think it might be good to use a benchmark that isn't I/O bound to really test the scheduler... but as said I am no expert on that and real life loads usually are I/O bound as well. Have to keep a careful eye on the harddisk though... Apr 22 11:51:06 deepdance smartd[3116]: Device: /dev/sda, SMART Prefailure Attribute: 3 Spin_Up_Time changed from 154 to 150 (well threshold is at 033, so still plenty to go, hope it will take some time till the next change) [1] http://martin-steigerwald.de/amarok-machine/ ;) Regards, -- Martin 'Helios' Steigerwald - http://www.Lichtvoll.de GPG: 03B0 0D6C 0040 0710 4AFA B82F 991B EAAC A599 84C7 ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [ck] Re: Ten percent test 2007-04-22 10:48 ` [ck] " Martin Steigerwald @ 2007-04-22 11:15 ` Con Kolivas 0 siblings, 0 replies; 92+ messages in thread From: Con Kolivas @ 2007-04-22 11:15 UTC (permalink / raw) To: ck Cc: Martin Steigerwald, Ed Tomlinson, Mike Galbraith, Andrew Morton, linux list On Sunday 22 April 2007 20:48, Martin Steigerwald wrote: > Am Montag 09 April 2007 schrieb Mike Galbraith: > > On Mon, 2007-04-09 at 07:26 -0400, Ed Tomlinson wrote: > > > On Monday 09 April 2007 01:38, Mike Galbraith wrote: > > > > On Sun, 2007-04-08 at 09:08 -0400, Ed Tomlinson wrote: > > > > > Hi, > > > > > > > > > > I am one of those who have been happily testing Con's patches. > > > > > > > > > > They work better than mainline here. > > > > > > > > (I tried a UP kernel yesterday, and even a single kernel build > > > > would make noticeable hitches if I move a window around. YMMV etc.) > > > > > > Interesting. I run UP amd64, 1000HZ, 1.25G, preempt off (on causes > > > kernel stalls with no messages - but that is another story). I do > > > not notice a single make. When several are running the desktop > > > slows down a bit. I do not have X niced. Wonder why we see such > > > different results? > > > > Probably because with your processor, in general cc1 can get the job > > done faster, as can X. The latency big hit happens when you hit the > > end of the rotation. You simply don't hit it as often as I do. Anyone > > with an old PIII box should hit the wall very quickly indeed. I > > haven't had time to try it here. > > Hi! > > I am running 2.6.20.7 + sd-0.44 on an IBM ThinkPad T23 that I use as my > Amarok machine[1]. It has a Pentium 3 with 1.13 GHz using ondemand > frequency scaling and XFS as filesystem. > > So far music playback has been perfect even when I had it building kernel > packages while wildly clicking around starting apps and then moving the > Amarok window like mad while solid window moving is enabled. 
Amarok / > xine continued to play the music totally unimpressed of that. > > So for me from a users point of view who wants good music playback *no > matter what*, this is already perfect. Also the desktop feels quite > snappy to me. It was only slow on anything I/O bound but thats > understandable IMHO when make-kpkg tar -bzips the kernel source while 20 > KDE applications are starting and Amarok plays music. > > Should I try any specific tests? This also goes out to anybody else, > especially to you, Con. So if you want me to run some benchmarks, please > tell me. I am not experienced in benchmarking, but if you tell me what to > do, I can try it out. I prefer benchmarks that do not disrupt music > playback, but can run more aggressive benchmarks over night. I think it > might be good to use a benchmark that isn't I/O bound to really test the > scheduler... but as said I am no expert on that and real life loads > usually are I/O bound as well. > > Have to have an carefully eye on the harddisk though... > > Apr 22 11:51:06 deepdance smartd[3116]: Device: /dev/sda, SMART Prefailure > Attribute: 3 Spin_Up_Time changed from 154 to 150 > > (well threshold is at 033, so still plenty to go, hope it will take some > time till the next change) > > [1] http://martin-steigerwald.de/amarok-machine/ ;) Thanks for the report. In your case, you've done the testing I require; that for your workloads everything works as you'd desire it without obvious problems. Just keeping an eye on newer versions if you have the time and inclination and making sure that everything stays as you expect it would be the most helpful thing you can do. Thanks! -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-09 5:38 ` Mike Galbraith 2007-04-09 11:26 ` Ed Tomlinson @ 2007-04-10 2:39 ` Mike Galbraith 2007-04-10 11:23 ` Ed Tomlinson 1 sibling, 1 reply; 92+ messages in thread From: Mike Galbraith @ 2007-04-10 2:39 UTC (permalink / raw) To: LKML; +Cc: Con Kolivas, Ingo Molnar, Andrew Morton, ck list On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote: > I don't think you can have very much effect on latency using nice with > SD once the CPU is fully utilized. See below. > > /* > * This contains a bitmap for each dynamic priority level with empty slots > * for the valid priorities each different nice level can have. It allows > * us to stagger the slots where differing priorities run in a way that > * keeps latency differences between different nice levels at a minimum. > * ie, where 0 means a slot for that priority, priority running from left to > * right: > * nice -20 0000000000000000000000000000000000000000 > * nice -10 1001000100100010001001000100010010001000 > * nice 0 0101010101010101010101010101010101010101 > * nice 5 1101011010110101101011010110101101011011 > * nice 10 0110111011011101110110111011101101110111 > * nice 15 0111110111111011111101111101111110111111 > * nice 19 1111111111111111111011111111111111111111 > */ > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always > proceed downward in priority until they hit the expired array. That's > the design. There's another aspect of this that may require some thought - kernel threads. As load increases, so does rotation length. Would you really want CPU hogs routinely preempting house-keepers under load? -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-10 2:39 ` Mike Galbraith @ 2007-04-10 11:23 ` Ed Tomlinson 2007-04-10 12:04 ` Mike Galbraith 0 siblings, 1 reply; 92+ messages in thread From: Ed Tomlinson @ 2007-04-10 11:23 UTC (permalink / raw) To: Mike Galbraith; +Cc: LKML, Con Kolivas, Ingo Molnar, Andrew Morton, ck list On Monday 09 April 2007 22:39, Mike Galbraith wrote: > On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote: > > > I don't think you can have very much effect on latency using nice with > > SD once the CPU is fully utilized. See below. > > > > /* > > * This contains a bitmap for each dynamic priority level with empty slots > > * for the valid priorities each different nice level can have. It allows > > * us to stagger the slots where differing priorities run in a way that > > * keeps latency differences between different nice levels at a minimum. > > * ie, where 0 means a slot for that priority, priority running from left to > > * right: > > * nice -20 0000000000000000000000000000000000000000 > > * nice -10 1001000100100010001001000100010010001000 > > * nice 0 0101010101010101010101010101010101010101 > > * nice 5 1101011010110101101011010110101101011011 > > * nice 10 0110111011011101110110111011101101110111 > > * nice 15 0111110111111011111101111101111110111111 > > * nice 19 1111111111111111111011111111111111111111 > > */ > > > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always > > proceed downward in priority until they hit the expired array. That's > > the design. > > There's another aspect of this that may require some thought - kernel > threads. As load increases, so does rotation length. Would you really > want CPU hogs routinely preempting house-keepers under load? SD has a schedule batch nice level. This is good for tasks that want lots of cpu when they can get it. If you overload your cpu I expect the box to slow down - including kernel threads. If really required they can be started with a higher priority... 
Ed ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-10 11:23 ` Ed Tomlinson @ 2007-04-10 12:04 ` Mike Galbraith 0 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-10 12:04 UTC (permalink / raw) To: Ed Tomlinson; +Cc: LKML, Con Kolivas, Ingo Molnar, Andrew Morton, ck list On Tue, 2007-04-10 at 07:23 -0400, Ed Tomlinson wrote: > On Monday 09 April 2007 22:39, Mike Galbraith wrote: > > On Mon, 2007-04-09 at 07:38 +0200, Mike Galbraith wrote: > > > > > I don't think you can have very much effect on latency using nice with > > > SD once the CPU is fully utilized. See below. > > > > > > /* > > > * This contains a bitmap for each dynamic priority level with empty slots > > > * for the valid priorities each different nice level can have. It allows > > > * us to stagger the slots where differing priorities run in a way that > > > * keeps latency differences between different nice levels at a minimum. > > > * ie, where 0 means a slot for that priority, priority running from left to > > > * right: > > > * nice -20 0000000000000000000000000000000000000000 > > > * nice -10 1001000100100010001001000100010010001000 > > > * nice 0 0101010101010101010101010101010101010101 > > > * nice 5 1101011010110101101011010110101101011011 > > > * nice 10 0110111011011101110110111011101101110111 > > > * nice 15 0111110111111011111101111101111110111111 > > > * nice 19 1111111111111111111011111111111111111111 > > > */ > > > > > > Nice allocates bandwidth, but as long as the CPU is busy, tasks always > > > proceed downward in priority until they hit the expired array. That's > > > the design. > > > > There's another aspect of this that may require some thought - kernel > > threads. As load increases, so does rotation length. Would you really > > want CPU hogs routinely preempting house-keepers under load? > > SD has a schedule batch nice level. This is good for tasks that want lots > of cpu when they can get it. 
If you overload your cpu I expect the box > to slow down - including kernel threads. If really required they can be > started with a higher priority... Sure. Anything that is latency sensitive, and those kernel threads that are necessary for system function can be made RT to bypass the designed-in latency. It's just another thing that should be considered before integration. Now if burst loads (only one of which is the desktop) would just cease to exist... -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: Ten percent test 2007-04-06 9:28 ` Con Kolivas 2007-04-06 10:03 ` Ingo Molnar @ 2007-04-06 10:48 ` Mike Galbraith 1 sibling, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-06 10:48 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Fri, 2007-04-06 at 19:28 +1000, Con Kolivas wrote: > On Friday 06 April 2007 19:07, Mike Galbraith wrote: > > On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote: > > > On Thursday 05 April 2007 21:54, Ingo Molnar wrote: > > > > - fiftyp.c: noticeable, but alot better than previously! > > > > > > fiftyp.c seems to have been stumbled across by accident as having an > > > effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I > > > suggest a ten percent version like the following would be more useful as > > > a test for the harmful effect discovered in fiftyp.c. (/me throws in > > > obligatory code style change). > > > > > > Starts 15 processes that sleep ten times longer than they run. Change > > > forks to 15 times the number of cpus you have and it should work on any > > > size hardware. > > > > I was more focused on the general case, but all I should have to do to > > de-claw all of these sleep exploits is account rr time (only a couple of > > lines, done and building now). It's only a couple of lines. > > The more you try to "de-claw" these sleep exploits the less effective you make > your precious interactive estimator. Feel free to keep adding endless tweaks > to undo the other tweaks in order to try and achieve what SD has by design. I haven't seen SD achieve what it's design docs claim yet, so yup, I'm going to keep right on trying to fix the corner cases in what we have that _does_ give me the interactivity I want. -Mike ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-04-03 5:31 ` Mike Galbraith 2007-04-03 6:00 ` Mike Galbraith 2007-04-03 6:01 ` Ingo Molnar @ 2007-04-03 10:57 ` Mike Galbraith 2 siblings, 0 replies; 92+ messages in thread From: Mike Galbraith @ 2007-04-03 10:57 UTC (permalink / raw) To: Con Kolivas; +Cc: Ingo Molnar, linux list, Andrew Morton, ck list On Tue, 2007-04-03 at 07:31 +0200, Mike Galbraith wrote: > On Tue, 2007-04-03 at 12:37 +1000, Con Kolivas wrote: > > On Thursday 29 March 2007 15:50, Mike Galbraith wrote: > > > On Thu, 2007-03-29 at 09:44 +1000, Con Kolivas wrote: > > > + * This contains a bitmap for each dynamic priority level with empty slots > > > + * for the valid priorities each different nice level can have. It allows > > > + * us to stagger the slots where differing priorities run in a way that > > > + * keeps latency differences between different nice levels at a minimum. > > > + * ie, where 0 means a slot for that priority, priority running from left > > > to + * right: > > > + * nice -20 0000000000000000000000000000000000000000 > > > + * nice -10 1001000100100010001001000100010010001000 > > > + * nice 0 0101010101010101010101010101010101010101 > > > + * nice 5 1101011010110101101011010110101101011011 > > > + * nice 10 0110111011011101110110111011101101110111 > > > + * nice 15 0111110111111011111101111101111110111111 > > > + * nice 19 1111111111111111111011111111111111111111 > > > > Try two instances of chew.c at _differing_ nice levels on one cpu on mainline, > > and then SD. This is why you can't renice X on mainline. > > How about something more challenging instead :) > > The numbers below are from my scheduler tree with massive_intr running > at nice 0, and chew at nice 5. Below these numbers are 100 lines from > the exact center of chew's output. 
> > (interactivity remains intact with this rather heavy load) > > root@Homer: ./massive_intr 30 180 > 005671 00001506 > 005657 00001506 > 005651 00001491 > 005647 00001466 > 005661 00001484 > 005660 00001475 > 005645 00001514 > 005668 00001384 > 005673 00001516 > 005656 00001449 > 005664 00001512 > 005659 00001507 > 005667 00001513 > 005663 00001521 > 005670 00001440 > 005649 00001522 > 005652 00001487 > 005648 00001405 > 005665 00001472 > 005669 00001418 > 005662 00001489 > 005674 00001523 > 005650 00001480 > 005655 00001476 > 005672 00001530 > 005653 00001463 > 005654 00001427 > 005646 00001499 > 005658 00001510 > 005666 00001476 Taking a little break from tinkering, I built/ran rsd-0.38 as well. While chew usually says "out for N < 500ms", I see spikes like those below the massive_intr numbers. root@Homer: ./massive_intr 30 180 (nice 0) 006596 00001346 006613 00001475 006605 00001463 006606 00001423 006598 00001279 006609 00001458 006600 00001378 006591 00001491 006610 00001413 006588 00001361 006602 00001401 006601 00001412 006607 00001373 006604 00001449 006599 00001398 006608 00001269 006611 00001464 006593 00001349 006614 00001335 006612 00001512 006615 00001422 006589 00001363 006617 00001362 006597 00001435 006592 00001354 006595 00001425 006616 00001348 006603 00001308 006594 00001360 006590 00001397 (spikes from run above) pid 6585, prio 0, out for 178 ms, ran for 12 ms, load 6% pid 6585, prio 0, out for 175 ms, ran for 13 ms, load 7% pid 6585, prio 0, out for 1901 ms, ran for 12 ms, load 0% pid 6585, prio 0, out for 61 ms, ran for 12 ms, load 17% ... pid 6585, prio 0, out for 148 ms, ran for 11 ms, load 7% pid 6585, prio 0, out for 229 ms, ran for 13 ms, load 5% pid 6585, prio 0, out for 182 ms, ran for 11 ms, load 6% pid 6585, prio 0, out for 1306 ms, ran for 11 ms, load 0% pid 6585, prio 0, out for 72 ms, ran for 12 ms, load 15% pid 6585, prio 0, out for 252 ms, ran for 11 ms, load 4% .... 
(spikes from massive_intr at nice 0 and chew at nice -20) pid 6547, prio -20, out for 132 ms, ran for 119 ms, load 47% pid 6547, prio -20, out for 52 ms, ran for 119 ms, load 69% pid 6547, prio -20, out for 4 ms, ran for 96 ms, load 95% pid 6547, prio -20, out for 1251 ms, ran for 24 ms, load 1% pid 6547, prio -20, out for 78 ms, ran for 1561 ms, load 95% pid 6547, prio -20, out for 89 ms, ran for 120 ms, load 57% pid 6547, prio -20, out for 69 ms, ran for 119 ms, load 63% pid 6547, prio -20, out for 4125 ms, ran for 119 ms, load 2% pid 6547, prio -20, out for 73 ms, ran for 119 ms, load 62% pid 6547, prio -20, out for 110 ms, ran for 120 ms, load 52% pid 6547, prio -20, out for 57 ms, ran for 119 ms, load 67% ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas 2007-03-28 17:34 ` [ck] " Prakash Punnoor 2007-03-28 18:48 ` Ingo Molnar @ 2007-03-29 6:36 ` Con Kolivas 2007-04-23 8:58 ` Andrew Morton 3 siblings, 0 replies; 92+ messages in thread From: Con Kolivas @ 2007-03-29 6:36 UTC (permalink / raw) To: linux list; +Cc: Andrew Morton, Ingo Molnar, Andy Whitcroft, ck list On Thursday 29 March 2007 02:37, Con Kolivas wrote: > I'm cautiously optimistic that we're at the thin edge of the bugfix wedge > now. My neck condition got a lot worse today. I'm forced offline for a week and will be uncontactable. -- -ck ^ permalink raw reply [flat|nested] 92+ messages in thread
* Re: [PATCH] sched: staircase deadline misc fixes 2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas ` (2 preceding siblings ...) 2007-03-29 6:36 ` Con Kolivas @ 2007-04-23 8:58 ` Andrew Morton 3 siblings, 0 replies; 92+ messages in thread From: Andrew Morton @ 2007-04-23 8:58 UTC (permalink / raw) To: Con Kolivas; +Cc: linux list, Ingo Molnar, Andy Whitcroft, ck list On Thu, 29 Mar 2007 02:37:38 +1000 Con Kolivas <kernel@kolivas.org> wrote: > test.kernel.org found some idle time regressions in the latest update to the > staircase deadline scheduler and Andy Whitcroft helped me track down the > offending problem which was present in all previous RSDL schedulers but > previously wouldn't be manifest without changes in nice. So here is a bugfix > for the set_load_weight being incorrectly set and a few other minor > improvements. Thanks Andy! > > I'm cautiously optimistic that we're at the thin edge of the bugfix wedge now. > > --- > set_load_weight() should be performed after p->quota is set. This fixes a > large SMP performance regression. > > Make sure rr_interval is never set to less than one jiffy. > > Some sanity checking in update_cpu_clock will prevent bogus sched_clock > values. > > SCHED_BATCH tasks should not set the rq->best_static_prio field. > > Correct sysctl rr_interval description to describe the value in milliseconds. > > Style fixes. > > Signed-off-by: Con Kolivas <kernel@kolivas.org> > > --- > Documentation/sysctl/kernel.txt | 8 ++-- > kernel/sched.c | 73 +++++++++++++++++++++++++++++----------- OK, this is bizarre. 
I'm getting this: [ 52.754522] RTNL: assertion failed at net/ipv4/devinet.c (1055) [ 52.758258] [<c02cb6f7>] inetdev_event+0x46/0x2d8 [ 52.762041] [<c01049c9>] show_trace_log_lvl+0x28/0x2c [ 52.765887] [<c0105482>] show_trace+0xf/0x13 [ 52.769627] [<c01054d7>] dump_stack+0x14/0x18 [ 52.773320] [<c029b22e>] rtnl_unlock+0xd/0x2f [ 52.776999] [<c029f410>] fib_rules_event+0x3a/0xeb [ 52.780678] [<c01236aa>] notifier_call_chain+0x2c/0x55 [ 52.784339] [<c012371a>] raw_notifier_call_chain+0x17/0x1b [ 52.787975] [<c0295984>] dev_open+0x63/0x6b [ 52.791587] [<c02944fd>] dev_change_flags+0x50/0x104 [ 52.795201] [<c02cbcf4>] devinet_ioctl+0x259/0x57b [ 52.798798] [<c02955b2>] dev_ifsioc+0x113/0x3a0 [ 52.802408] [<c028b127>] sock_ioctl+0x1a1/0x1c4 [ 52.805966] [<c028af86>] sock_ioctl+0x0/0x1c4 [ 52.809475] [<c0165969>] do_ioctl+0x19/0x4d [ 52.812977] [<c0165b99>] vfs_ioctl+0x1fc/0x216 [ 52.816478] [<c0165bff>] sys_ioctl+0x4c/0x65 [ 52.819944] [<c0103b68>] syscall_call+0x7/0xb [ 52.823395] ======================= [ 52.826923] RTNL: assertion failed at net/ipv4/igmp.c (1358) [ 52.830485] [<c02cf545>] ip_mc_up+0x35/0x59 [ 52.834034] [<c029b22e>] rtnl_unlock+0xd/0x2f [ 52.837569] [<c02cb7ed>] inetdev_event+0x13c/0x2d8 [ 52.841123] [<c01049c9>] show_trace_log_lvl+0x28/0x2c [ 52.844682] [<c0105482>] show_trace+0xf/0x13 [ 52.848227] [<c01054d7>] dump_stack+0x14/0x18 [ 52.851752] [<c029b22e>] rtnl_unlock+0xd/0x2f [ 52.855242] [<c029f410>] fib_rules_event+0x3a/0xeb [ 52.858734] [<c01236aa>] notifier_call_chain+0x2c/0x55 [ 52.862241] [<c012371a>] raw_notifier_call_chain+0x17/0x1b [ 52.865759] [<c0295984>] dev_open+0x63/0x6b [ 52.869191] [<c02944fd>] dev_change_flags+0x50/0x104 [ 52.872571] [<c02cbcf4>] devinet_ioctl+0x259/0x57b [ 52.875998] [<c02955b2>] dev_ifsioc+0x113/0x3a0 [ 52.879399] [<c028b127>] sock_ioctl+0x1a1/0x1c4 [ 52.882741] [<c028af86>] sock_ioctl+0x0/0x1c4 [ 52.886025] [<c0165969>] do_ioctl+0x19/0x4d [ 52.889292] [<c0165b99>] vfs_ioctl+0x1fc/0x216 [ 52.892534] [<c0165bff>] 
sys_ioctl+0x4c/0x65 [ 52.895760] [<c0103b68>] syscall_call+0x7/0xb [ 52.898982] ======================= [ 52.907714] RTNL: assertion failed at net/ipv4/igmp.c (1205) [ 52.910229] [<c02cf3b7>] ip_mc_inc_group+0x3c/0x195 [ 52.912771] [<c01054d7>] dump_stack+0x14/0x18 [ 52.915314] [<c02cf551>] ip_mc_up+0x41/0x59 [ 52.917856] [<c029b22e>] rtnl_unlock+0xd/0x2f [ 52.920411] [<c02cb7ed>] inetdev_event+0x13c/0x2d8 [ 52.922990] [<c01049c9>] show_trace_log_lvl+0x28/0x2c [ 52.925568] [<c0105482>] show_trace+0xf/0x13 [ 52.928101] [<c01054d7>] dump_stack+0x14/0x18 [ 52.930591] [<c029b22e>] rtnl_unlock+0xd/0x2f [ 52.933061] [<c029f410>] fib_rules_event+0x3a/0xeb [ 52.935551] [<c01236aa>] notifier_call_chain+0x2c/0x55 [ 52.938071] [<c012371a>] raw_notifier_call_chain+0x17/0x1b [ 52.940605] [<c0295984>] dev_open+0x63/0x6b [ 52.943141] [<c02944fd>] dev_change_flags+0x50/0x104 [ 52.945670] [<c02cbcf4>] devinet_ioctl+0x259/0x57b [ 52.948191] [<c02955b2>] dev_ifsioc+0x113/0x3a0 [ 52.950698] [<c028b127>] sock_ioctl+0x1a1/0x1c4 [ 52.953185] [<c028af86>] sock_ioctl+0x0/0x1c4 [ 52.955656] [<c0165969>] do_ioctl+0x19/0x4d [ 52.958122] [<c0165b99>] vfs_ioctl+0x1fc/0x216 [ 52.960590] [<c0165bff>] sys_ioctl+0x4c/0x65 [ 52.963058] [<c0103b68>] syscall_call+0x7/0xb [ 52.965523] ======================= and bisection shows that this patch is where it starts happening. I see no way in which this patch can cause ASSERT_RTNL to start triggering. Could be the there are dynamic changes which are triggering some problem in the networking tree, but the net code looks straightforward enough. Anyway, after a few such traces things seem to settle down and there are no apparent problems, so I guess I'll just ship it as-is. Config is http://userweb.kernel.org/~akpm/config-sony.txt ^ permalink raw reply [flat|nested] 92+ messages in thread
end of thread, other threads:[~2007-04-23 8:59 UTC | newest]
Thread overview: 92+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-28 16:37 [PATCH] sched: staircase deadline misc fixes Con Kolivas
2007-03-28 17:34 ` [ck] " Prakash Punnoor
2007-04-01 6:40 ` Prakash Punnoor
[not found] ` <b14e81f00704010724i3155a16en91074ab789416f3d@mail.gmail.com>
2007-04-01 20:03 ` Prakash Punnoor
2007-03-28 18:48 ` Ingo Molnar
2007-03-28 23:44 ` Con Kolivas
2007-03-29 5:50 ` Mike Galbraith
2007-03-29 6:29 ` Mike Galbraith
2007-03-29 6:54 ` Mike Galbraith
2007-03-29 8:18 ` Mike Galbraith
2007-03-29 12:55 ` [ck] " michael chang
2007-04-03 2:35 ` Con Kolivas
2007-04-03 2:37 ` Con Kolivas
2007-04-03 5:31 ` Mike Galbraith
2007-04-03 6:00 ` Mike Galbraith
2007-04-03 6:01 ` Ingo Molnar
2007-04-03 6:11 ` Mike Galbraith
2007-04-05 11:02 ` Mike Galbraith
2007-04-05 11:09 ` Ingo Molnar
2007-04-05 11:12 ` Mike Galbraith
2007-04-05 11:15 ` Ingo Molnar
2007-04-05 13:18 ` Johannes Stezenbach
2007-04-05 15:28 ` Mike Galbraith
2007-04-05 11:54 ` [test] sched: SD-latest versus Mike's latest Ingo Molnar
2007-04-05 12:10 ` Mike Galbraith
2007-04-05 12:12 ` Ingo Molnar
2007-04-05 12:24 ` Mike Galbraith
2007-04-05 16:08 ` Con Kolivas
2007-04-05 19:05 ` Ingo Molnar
2007-04-05 20:29 ` Mike Galbraith
2007-04-06 1:03 ` Ten percent test Con Kolivas
2007-04-06 9:07 ` Mike Galbraith
2007-04-06 9:28 ` Con Kolivas
2007-04-06 10:03 ` Ingo Molnar
2007-04-06 10:40 ` Mike Galbraith
2007-04-07 6:50 ` Con Kolivas
2007-04-07 16:12 ` Gene Heskett
2007-04-07 18:08 ` Ingo Molnar
2007-04-07 18:23 ` Gene Heskett
2007-04-07 18:52 ` Ingo Molnar
2007-04-07 20:30 ` Gene Heskett
2007-04-08 10:41 ` Ingo Molnar
2007-04-08 10:58 ` Ingo Molnar
2007-04-08 17:04 ` Gene Heskett
2007-04-09 4:03 ` Mike Galbraith
2007-04-09 4:08 ` Gene Heskett
2007-04-09 5:59 ` Mike Galbraith
2007-04-09 13:01 ` Gene Heskett
2007-04-08 11:33 ` Gene Heskett
2007-04-08 11:40 ` Mike Galbraith
2007-04-08 12:02 ` Mike Galbraith
2007-04-08 17:57 ` Gene Heskett
2007-04-09 4:19 ` Mike Galbraith
2007-04-09 5:23 ` Gene Heskett
2007-04-09 6:09 ` Mike Galbraith
2007-04-08 17:56 ` Gene Heskett
2007-04-09 4:17 ` Mike Galbraith
2007-04-09 5:16 ` Gene Heskett
2007-04-09 6:06 ` Mike Galbraith
2007-04-09 8:24 ` Mike Galbraith
2007-04-08 18:51 ` Rene Herman
2007-04-09 4:23 ` Mike Galbraith
2007-04-09 12:14 ` Rene Herman
2007-04-09 13:27 ` Andreas Mohr
2007-04-09 19:54 ` Rene Herman
2007-04-09 14:15 ` Ingo Molnar
2007-04-09 17:05 ` Rene Herman
2007-04-09 17:48 ` Ingo Molnar
2007-04-09 19:09 ` Rene Herman
2007-04-09 19:56 ` Gene Heskett
2007-04-09 17:10 ` Mike Galbraith
2007-04-09 13:53 ` Ingo Molnar
2007-04-09 15:37 ` Rene Herman
2007-04-07 19:14 ` Mike Galbraith
2007-04-07 20:31 ` Gene Heskett
2007-04-09 17:51 ` William Lee Irwin III
2007-04-09 18:03 ` Ingo Molnar
2007-04-09 18:44 ` William Lee Irwin III
2007-04-07 16:32 ` Mike Galbraith
2007-04-08 13:08 ` Ed Tomlinson
2007-04-09 5:38 ` Mike Galbraith
2007-04-09 11:26 ` Ed Tomlinson
2007-04-09 16:50 ` Mike Galbraith
2007-04-22 10:48 ` [ck] " Martin Steigerwald
2007-04-22 11:15 ` Con Kolivas
2007-04-10 2:39 ` Mike Galbraith
2007-04-10 11:23 ` Ed Tomlinson
2007-04-10 12:04 ` Mike Galbraith
2007-04-06 10:48 ` Mike Galbraith
2007-04-03 10:57 ` [PATCH] sched: staircase deadline misc fixes Mike Galbraith
2007-03-29 6:36 ` Con Kolivas
2007-04-23 8:58 ` Andrew Morton
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox