public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* O(1) scheduler starvation
@ 2003-06-18  7:53 Felipe Alfaro Solana
  2003-06-18 12:04 ` Mike Galbraith
  2003-06-18 16:51 ` William Lee Irwin III
  0 siblings, 2 replies; 15+ messages in thread
From: Felipe Alfaro Solana @ 2003-06-18  7:53 UTC (permalink / raw)
  To: davidm; +Cc: LKML

[-- Attachment #1: Type: text/plain, Size: 735 bytes --]

Hi!

I've been poking around and found the following link on O(1) scheduler
starvation problems:

http://www.hpl.hp.com/research/linux/kernel/o1-starve.php

The web page contains a small test program which supposedly is able to
make two processes starvate. However, I've been unable to reproduce what
is described in the above link. In fact, the CPU usage ratio ranges
between 60-40% or 70-30% in the worst cases.

I'm running 2.5.72-mm1 with Mike Galbraith's scheduler patches and a
small patch I made myself to improve interactivity (mainly, to stop XMMS
from skipping by adjusting some scheduler parameters).

What's going on here? Is the previous article outdated, or have the
starvation problems been definitively fixed?

Thanks!

[-- Attachment #2: galbraith.patch --]
[-- Type: text/x-patch, Size: 7376 bytes --]

diff -prauN linux-2.5.70-bk8/include/linux/sched.h galbraith-2.5.70-bk8-1/include/linux/sched.h
--- linux-2.5.70-bk8/include/linux/sched.h	Mon Jun  2 02:15:11 2003
+++ galbraith-2.5.70-bk8-1/include/linux/sched.h	Tue Jun  3 07:10:17 2003
@@ -336,7 +336,9 @@
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long last_run;
+	unsigned long long last_run;
+	unsigned int run_nsecs;
+	unsigned int sleep_nsecs;
 
 	unsigned long policy;
 	unsigned long cpus_allowed;
diff -prauN linux-2.5.70-bk8/kernel/sched.c galbraith-2.5.70-bk8-1/kernel/sched.c
--- linux-2.5.70-bk8/kernel/sched.c	Tue May 27 04:05:34 2003
+++ galbraith-2.5.70-bk8-1/kernel/sched.c	Tue Jun  3 07:10:18 2003
@@ -74,6 +74,12 @@
 #define MAX_SLEEP_AVG		(10*HZ)
 #define STARVATION_LIMIT	(10*HZ)
 #define NODE_THRESHOLD		125
+#define SCHED_NANOSECOND	1
+#define SCHED_SECOND		(1000000000 * SCHED_NANOSECOND)
+#define SCHED_TICK		(SCHED_SECOND / HZ)
+#define TICKS_PER_SECOND	(SCHED_SECOND / SCHED_TICK)
+
+extern unsigned long long monotonic_clock(void);
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -342,9 +348,23 @@
  */
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	long sleep_time = jiffies - p->last_run - 1;
+	unsigned long long now = monotonic_clock();
+	long long sleep =  now - p->last_run + p->sleep_nsecs;
+	int ticks = 0, requeue_waker = 0;
+
+	if (sleep >= SCHED_TICK) {
+		while (sleep >= SCHED_SECOND) {
+			sleep -= SCHED_SECOND;
+			ticks += TICKS_PER_SECOND;
+		}
+		while (sleep >= SCHED_TICK) {
+			sleep -= SCHED_TICK;
+			ticks++;
+		}
+		p->sleep_nsecs = sleep;
+	} else p->sleep_nsecs += sleep;
 
-	if (sleep_time > 0) {
+	if (ticks > 0) {
 		int sleep_avg;
 
 		/*
@@ -355,7 +375,7 @@
 		 * spends sleeping, the higher the average gets - and the
 		 * higher the priority boost gets as well.
 		 */
-		sleep_avg = p->sleep_avg + sleep_time;
+		sleep_avg = p->sleep_avg + ticks;
 
 		/*
 		 * 'Overflow' bonus ticks go to the waker as well, so the
@@ -363,8 +383,10 @@
 		 * boosting tasks that are related to maximum-interactive
 		 * tasks.
 		 */
-		if (sleep_avg > MAX_SLEEP_AVG)
+		if (sleep_avg > MAX_SLEEP_AVG) {
 			sleep_avg = MAX_SLEEP_AVG;
+			p->sleep_nsecs = 0;
+		}
 		if (p->sleep_avg != sleep_avg) {
 			p->sleep_avg = sleep_avg;
 			p->prio = effective_prio(p);
@@ -548,6 +570,8 @@
 	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
 	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
 	p->prio = effective_prio(p);
+	p->run_nsecs = 0;
+	p->sleep_nsecs = 0;
 	set_task_cpu(p, smp_processor_id());
 
 	if (unlikely(!current->array))
@@ -1147,6 +1171,49 @@
 		(jiffies - (rq)->expired_timestamp >= \
 			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
+inline void __scheduler_tick(runqueue_t *rq, task_t *p)
+{
+	unsigned long long now = monotonic_clock();
+	prio_array_t *array = rq->active;
+	int ticks;
+
+	p->run_nsecs += now - p->last_run;
+	/* Task might have expired already, but not scheduled off yet */
+	if (p->array != array) {
+		set_tsk_need_resched(p);
+		goto abort;
+	}
+	if (p->run_nsecs < SCHED_TICK || p->policy == SCHED_FIFO )
+		goto abort;
+
+	for (ticks = 0; p->run_nsecs >= SCHED_TICK; ticks++)
+		p->run_nsecs -= SCHED_TICK;
+	if (ticks > p->time_slice)
+		show_task(p);
+	if (p->sleep_avg > ticks)
+		p->sleep_avg -= ticks;
+	else
+		p->sleep_avg = 0;
+	p->time_slice -= ticks;
+
+	if (p->time_slice <= 0) {
+		dequeue_task(p, p->array);
+		p->prio = effective_prio(p);
+		p->time_slice = task_timeslice(p);
+		p->first_time_slice = 0;
+		set_tsk_need_resched(p);
+		if ((EXPIRED_STARVING(rq) && !rt_task(p)) ||
+				!TASK_INTERACTIVE(p)) {
+			array = rq->expired;
+			if (!rq->expired_timestamp)
+				rq->expired_timestamp = jiffies;
+		}
+		enqueue_task(p, array);
+	}
+abort:
+	p->last_run = monotonic_clock();
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -1159,11 +1226,12 @@
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
+	int idle = p == rq->idle;
 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
-	if (p == rq->idle) {
+	if (idle) {
 		/* note: this timer irq context must be accounted for as well */
 		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
 			kstat_cpu(cpu).cpustat.system += sys_ticks;
@@ -1171,8 +1239,7 @@
 			kstat_cpu(cpu).cpustat.iowait += sys_ticks;
 		else
 			kstat_cpu(cpu).cpustat.idle += sys_ticks;
-		rebalance_tick(rq, 1);
-		return;
+		goto out;
 	}
 	if (TASK_NICE(p) > 0)
 		kstat_cpu(cpu).cpustat.nice += user_ticks;
@@ -1180,11 +1247,6 @@
 		kstat_cpu(cpu).cpustat.user += user_ticks;
 	kstat_cpu(cpu).cpustat.system += sys_ticks;
 
-	/* Task might have expired already, but not scheduled off yet */
-	if (p->array != rq->active) {
-		set_tsk_need_resched(p);
-		goto out;
-	}
 	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
@@ -1194,42 +1256,10 @@
 	 * it possible for interactive tasks to use up their
 	 * timeslices at their highest priority levels.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
-	if (unlikely(rt_task(p))) {
-		/*
-		 * RR tasks need a special form of timeslice management.
-		 * FIFO tasks have no timeslices.
-		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
-
-			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
-		}
-		goto out_unlock;
-	}
-	if (!--p->time_slice) {
-		dequeue_task(p, rq->active);
-		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
-			enqueue_task(p, rq->expired);
-		} else
-			enqueue_task(p, rq->active);
-	}
-out_unlock:
+	 __scheduler_tick(rq, p);
 	spin_unlock(&rq->lock);
 out:
-	rebalance_tick(rq, 0);
+	rebalance_tick(rq, idle);
 }
 
 void scheduling_functions_start_here(void) { }
@@ -1264,8 +1294,8 @@
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->last_run = jiffies;
 	spin_lock_irq(&rq->lock);
+	__scheduler_tick(rq, prev);
 
 	/*
 	 * if entering off of a kernel preemption go straight
@@ -1320,6 +1350,7 @@
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
+		next->last_run = prev->last_run;
 
 		prepare_arch_switch(rq, next);
 		prev = context_switch(rq, prev, next);
diff -prauN linux-2.5.70-bk8/arch/i386/kernel/timers/timer_tsc.c galbraith-2.5.70-bk8-1/arch/i386/kernel/timers/timer_tsc.c
--- linux-2.5.70-bk8/arch/i386/kernel/timers/timer_tsc.c	Mon Apr 21 08:11:07 2003
+++ galbraith-2.5.70-bk8-1/arch/i386/kernel/timers/timer_tsc.c	Tue Jun  3 07:10:18 2003
@@ -102,12 +102,13 @@
 static unsigned long long monotonic_clock_tsc(void)
 {
 	unsigned long long last_offset, this_offset, base;
+	unsigned long flags;
 	
 	/* atomically read monotonic base & last_offset */
-	read_lock_irq(&monotonic_lock);
+	read_lock_irqsave(&monotonic_lock, flags);
 	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
 	base = monotonic_base;
-	read_unlock_irq(&monotonic_lock);
+	read_unlock_irqrestore(&monotonic_lock, flags);
 
 	/* Read the Time Stamp Counter */
 	rdtscll(this_offset);

[-- Attachment #3: sched.patch --]
[-- Type: text/plain, Size: 695 bytes --]

--- old/kernel/sched.c	2003-06-17 21:04:21.240902000 +0200
+++ new/kernel/sched.c	2003-06-17 20:58:54.840902000 +0200
@@ -66,14 +66,14 @@
  * they expire.
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(200 * HZ / 1000)
+#define MAX_TIMESLICE		( 20 * HZ / 1000)
 #define CHILD_PENALTY		50
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
+#define PRIO_BONUS_RATIO	20
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
+#define MAX_SLEEP_AVG		(2*HZ)
+#define STARVATION_LIMIT	(2*HZ)
 #define NODE_THRESHOLD		125
 #define SCHED_NANOSECOND	1
 #define SCHED_SECOND		(1000000000 * SCHED_NANOSECOND)

^ permalink raw reply	[flat|nested] 15+ messages in thread
* Re: O(1) scheduler starvation
@ 2003-06-18 20:44 Ricardo Galli
  2003-06-18 20:48 ` Marc-Christian Petersen
  0 siblings, 1 reply; 15+ messages in thread
From: Ricardo Galli @ 2003-06-18 20:44 UTC (permalink / raw)
  To: linux-kernel

> Have you seen this email I just posted to lkml?
>
> [PATCH] 2.5.72 O(1) interactivity bugfix
>
> It may be affecting the scheduler in all sorts of ways.

I've applied the patch you posted in your web page for 2.4.21-ck1. It feels 
more reactive but has an annoying collateral effect, it seems to slow down to 
new processes forked from an interactive program.

I see it when selecting a PGP signed message in kmail, it takes up to two 
seconds to show the message body. The lag wasn't so noticeable before.

Regards,
 
-- 
  ricardo galli       GPG id C8114D34

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2003-06-18 22:14 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-06-18  7:53 O(1) scheduler starvation Felipe Alfaro Solana
2003-06-18 12:04 ` Mike Galbraith
2003-06-18 12:16   ` Helge Hafting
2003-06-18 12:32     ` Mike Galbraith
2003-06-18 14:22   ` Felipe Alfaro Solana
2003-06-18 15:54     ` Mike Galbraith
2003-06-18 15:59       ` Con Kolivas
2003-06-18 16:29         ` Mike Galbraith
2003-06-18 21:30       ` Felipe Alfaro Solana
2003-06-18 22:32         ` Mike Galbraith
2003-06-18 16:52   ` William Lee Irwin III
2003-06-18 17:17     ` Mike Galbraith
2003-06-18 16:51 ` William Lee Irwin III
  -- strict thread matches above, loose matches on Subject: below --
2003-06-18 20:44 Ricardo Galli
2003-06-18 20:48 ` Marc-Christian Petersen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox