* [PATCH] sched: Add SCHED_BGND (background) scheduling policy
@ 2006-07-04 23:35 Peter Williams
2006-07-05 0:14 ` Con Kolivas
` (3 more replies)
0 siblings, 4 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-04 23:35 UTC (permalink / raw)
To: Andrew Morton
Cc: Nick Piggin, Linux Kernel, Con Kolivas, Peter Williams,
Ingo Molnar
Problem:
There is a genuine need for the ability to put tasks in the background
(a la the SCHED_IDLEPRIO policy in Con Kolivas's -sc kernels) as is
evidenced by comments in LKML re a desire for SCHED_BATCH tasks
to run completely in the background.
Solution:
Of course, one option would have been to just modify SCHED_BATCH so
that tasks with that policy run completely in the background, but there is a
genuine need for a non-background batch policy, so the solution adopted
is to implement a new policy, SCHED_BGND.
SCHED_BATCH means that it's a normal process and should get a fair
share of the CPU in accordance with its "nice" setting but it is NOT an
interactive task and should NOT receive any of the special treatment
that a task that is adjudged to be interactive receives. In particular,
it should always be moved to the expired array at the end of its time
slice as to do otherwise might result in CPU starvation for other tasks.
SCHED_BGND means it's totally unimportant and should only be given the
CPU if no one else wants it OR if not giving it the CPU could lead to
priority inversion or starvation of other tasks due to this task's holding
system resources.
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
---
include/linux/init_task.h | 6 -
include/linux/sched.h | 11 ++
kernel/fork.c | 1
kernel/mutex.c | 28 ++++++-
kernel/sched.c | 183 ++++++++++++++++++++++++++++++++++++++--------
5 files changed, 192 insertions(+), 37 deletions(-)
Index: MM-2.6.17-mm6/include/linux/init_task.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/init_task.h 2006-07-04 14:37:42.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/init_task.h 2006-07-04 14:38:12.000000000 +1000
@@ -99,9 +99,9 @@ extern struct group_info init_groups;
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
- .prio = MAX_PRIO-20, \
- .static_prio = MAX_PRIO-20, \
- .normal_prio = MAX_PRIO-20, \
+ .prio = MAX_RT_PRIO+20, \
+ .static_prio = MAX_RT_PRIO+20, \
+ .normal_prio = MAX_RT_PRIO+20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
Index: MM-2.6.17-mm6/include/linux/sched.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/sched.h 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/sched.h 2006-07-04 14:38:12.000000000 +1000
@@ -34,6 +34,8 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+/* Scheduler class for background tasks */
+#define SCHED_BGND 4
#ifdef __KERNEL__
@@ -503,13 +505,16 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define BGND_PRIO (MAX_RT_PRIO + 40)
+/* add another slot for SCHED_BGND tasks */
+#define MAX_PRIO (BGND_PRIO + 1)
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define has_rt_policy(p) \
- unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+ unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)
+#define bgnd_task(p) (unlikely((p)->policy == SCHED_BGND))
/*
* Some day this will be a full-fledged user tracking system..
@@ -810,6 +815,7 @@ struct task_struct {
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
+ unsigned int mutexes_held; /* for knowing when it's safe to repress SCHED_BGND tasks */
enum sleep_type sleep_type;
unsigned long policy;
@@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
+#define PF_UIWAKE 0x08000000 /* Just woken from uninterruptible sleep */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
Index: MM-2.6.17-mm6/kernel/sched.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/sched.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/sched.c 2006-07-04 14:38:12.000000000 +1000
@@ -59,7 +59,7 @@
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
@@ -73,7 +73,7 @@
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+#define MAX_USER_PRIO (USER_PRIO(BGND_PRIO))
/*
* Some helpers for converting nanosecond timing to jiffy resolution
@@ -171,7 +171,7 @@
*/
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+ max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
static unsigned int static_prio_timeslice(int static_prio)
{
@@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
return static_prio_timeslice(p->static_prio);
}
+#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
+#define safe_to_background(p) \
+ (!((p)->mutexes_held || \
+ (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
+
/*
* These are the runqueue data structures:
*/
@@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
{
int bonus, prio;
+ /* Ensure that background tasks stay at BGND_PRIO */
+ if (bgnd_task(p) && safe_to_background(p))
+ return BGND_PRIO;
+
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
+ if (prio > BGND_PRIO-1)
+ prio = BGND_PRIO-1;
return prio;
}
@@ -761,8 +770,18 @@ static void set_load_weight(struct task_
else
#endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
- } else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ } else {
+ /*
+ * Reduce the probability of a task escaping the background
+ * due to load balancing leaving it on a lightly used CPU.
+ * Can't use zero as that would kill load balancing when only
+ * background tasks are running.
+ */
+ if (bgnd_task(p))
+ p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);
+ else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ }
}
static inline void
@@ -834,7 +853,10 @@ static void __activate_task(struct task_
{
struct prio_array *target = rq->active;
- if (batch_task(p))
+ /* Don't punish batch tasks just tasks actually in the background
+ * as anything else is counter productive from a system wide aspect
+ */
+ if (task_in_background(p))
target = rq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
@@ -942,6 +964,8 @@ static void activate_task(struct task_st
if (!rt_task(p))
p->prio = recalc_task_prio(p, now);
+ p->flags &= ~PF_UIWAKE;
+
/*
* This checks to make sure it's not an uninterruptible task
* that is now waking up.
@@ -1484,6 +1508,7 @@ out_activate:
* sleep_avg beyond just interactive state.
*/
p->sleep_type = SLEEP_NONINTERACTIVE;
+ p->flags |= PF_UIWAKE;
} else
/*
@@ -3024,6 +3049,48 @@ void scheduler_tick(void)
}
goto out_unlock;
}
+
+ if (bgnd_task(p)) {
+ /*
+ * Do this even if there's only one task on the queue as
+ * we want to set the priority low so that any waking tasks
+ * can preempt.
+ */
+ if (task_in_background(p)) {
+ /*
+ * Tasks currently in the background will be
+ * at BGND_PRIO priority and preemption
+ * should be enough to keep them in check provided we
+ * don't let them adversely effect tasks on the expired
+ * array.
+ */
+ if (!safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->prio = effective_prio(p);
+ enqueue_task(p, rq->active);
+ } else if (rq->expired->nr_active &&
+ rq->best_expired_prio < p->prio) {
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+ else if (safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->normal_prio = BGND_PRIO;
+ /* this should be safe for PI purposes */
+ p->prio = p->normal_prio;
+ enqueue_task(p, rq->expired);
+ /*
+ * think about making this conditional to reduce
+ * context switch rate
+ */
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+
if (!--p->time_slice) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
@@ -3033,6 +3100,11 @@ void scheduler_tick(void)
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
+ /*
+ * No need to do anything special for background tasks here
+ * as TASK_INTERACTIVE() should fail when they're in the
+ * background.
+ */
if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
@@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
}
/*
+ * task time slice for SMT dependent idle purposes
+ */
+static unsigned int smt_timeslice(struct task_struct *p)
+{
+ if (task_in_background(p))
+ return 0;
+
+ return task_timeslice(p);
+}
+
+/*
+ * Is the thisp a higher priority task than thatp for SMT dependent idle
+ * purposes?
+ */
+static int task_priority_gt(const struct task_struct *thisp,
+ const struct task_struct *thatp)
+{
+ if (task_in_background(thisp))
+ return !task_in_background(thatp);
+
+ if (task_in_background(thatp))
+ return 1;
+
+ return thisp->static_prio < thatp->static_prio;
+}
+
+/*
* To minimise lock contention and not have to drop this_rq's runlock we only
* trylock the sibling runqueues and bypass those runqueues if we fail to
* acquire their lock. As we only trylock the normal locking order does not
@@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1;
} else {
- if (smt_curr->static_prio < p->static_prio &&
+ if (task_priority_gt(smt_curr, p) &&
!TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(smt_curr, sd) > task_timeslice(p))
+ smt_slice(smt_curr, sd) > smt_timeslice(p))
ret = 1;
}
unlock:
@@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
}
/*
+ * Switch the active and expired arrays.
+ */
+static struct prio_array *switch_arrays(struct rq *rq, int best_active_prio)
+{
+ struct prio_array *array = rq->active;
+
+ schedstat_inc(rq, sched_switch);
+ rq->active = rq->expired;
+ rq->expired = array;
+ rq->expired_timestamp = 0;
+ rq->best_expired_prio = best_active_prio;
+
+ return rq->active;
+}
+
+/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
@@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
}
array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- schedstat_inc(rq, sched_switch);
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
+ if (unlikely(!array->nr_active))
+ array = switch_arrays(rq, MAX_PRIO);
idx = sched_find_first_bit(array->bitmap);
+get_next:
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
+ /* very strict backgrounding */
+ if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
+ int tmp = sched_find_first_bit(rq->expired->bitmap);
+
+ if (likely(tmp < idx)) {
+ array = switch_arrays(rq, idx);
+ idx = tmp;
+ goto get_next;
+ }
+ }
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+ if (!rt_task(next) && interactive_sleep(next->sleep_type) && !bgnd_task(next)) {
unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
delta = 0;
@@ -4052,7 +4169,8 @@ recheck:
if (policy < 0)
policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND)
return -EINVAL;
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4063,8 +4181,8 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
+ policy == SCHED_BGND) != (param->sched_priority == 0))
return -EINVAL;
/*
@@ -4072,15 +4190,20 @@ recheck:
*/
if (!capable(CAP_SYS_NICE)) {
/*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
+ * can't change policy, except between SCHED_NORMAL,
+ * SCHED_BATCH or SCHED_BGND:
*/
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+ if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BATCH)) &&
!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
return -EPERM;
/* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+ if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND) &&
param->sched_priority > p->rt_priority &&
param->sched_priority >
p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
Index: MM-2.6.17-mm6/kernel/mutex.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
@@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
EXPORT_SYMBOL(__mutex_init);
+static inline void inc_mutex_count(void)
+{
+ current->mutexes_held++;
+}
+
+static inline void dec_mutex_count(void)
+{
+ current->mutexes_held--;
+}
+
/*
* We split the mutex lock/unlock logic into separate fastpath and
* slowpath functions, to reduce the register pressure on the fastpath.
@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ inc_mutex_count();
}
EXPORT_SYMBOL(mutex_lock);
@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
* into 'unlocked' state:
*/
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+ dec_mutex_count();
}
EXPORT_SYMBOL(mutex_unlock);
@@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
*/
int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
+
+ if (likely(!ret))
+ inc_mutex_count();
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
*/
int fastcall __sched mutex_trylock(struct mutex *lock)
{
- return __mutex_fastpath_trylock(&lock->count,
+ int ret = __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
+
+ if (likely(ret))
+ inc_mutex_count();
+
+ return ret;
}
EXPORT_SYMBOL(mutex_trylock);
Index: MM-2.6.17-mm6/kernel/fork.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/fork.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/fork.c 2006-07-04 14:38:12.000000000 +1000
@@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
+ p->mutexes_held = 0;
acct_clear_integrals(p);
p->it_virt_expires = cputime_zero;
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-04 23:35 [PATCH] sched: Add SCHED_BGND (background) scheduling policy Peter Williams
@ 2006-07-05 0:14 ` Con Kolivas
2006-07-05 0:49 ` Peter Williams
2006-07-05 0:44 ` Con Kolivas
` (2 subsequent siblings)
3 siblings, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2006-07-05 0:14 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
On Wednesday 05 July 2006 09:35, Peter Williams wrote:
> Problem:
>
> There is a genuine need for the ability to put tasks in the background
> (a la the SCHED_IDLEPRIO policy in Con Kolivas's -sc kernels) as is
> evidenced by comments in LKML re a desire for SCHED_BATCH tasks
> to run completely in the background.
>
> Solution:
>
> Of course, one option would have been to just modify SCHED_BATCH so
> that tasks with that policy run completely in the background but there is a
> genuine need for a non background batch policy so the solution adopted
> is to implement a new policy SCHED_BGND.
>
> SCHED_BATCH means that it's a normal process and should get a fair
> share of the CPU in accordance with its "nice" setting but it is NOT an
> interactive task and should NOT receive any of the special treatment
> that a task that is adjudged to be interactive receives. In particular,
> it should always be moved to the expired array at the end of its time
> slice as to do otherwise might result in CPU starvation for other tasks.
>
> SCHED_BGND means it's totally unimportant and should only be given the
> CPU if no one else wants it OR if not giving it the CPU could lead to
> priority inversion or starvation of other tasks due to this task's holding
> system resources.
Could we just call it SCHED_IDLEPRIO since it's the same thing and there are
tools out there that already use this name?
--
-ck
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 0:14 ` Con Kolivas
@ 2006-07-05 0:49 ` Peter Williams
2006-07-05 0:52 ` Con Kolivas
2006-07-05 8:05 ` Andreas Mohr
0 siblings, 2 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-05 0:49 UTC (permalink / raw)
To: Con Kolivas; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
Con Kolivas wrote:
> On Wednesday 05 July 2006 09:35, Peter Williams wrote:
>> Problem:
>>
>> There is a genuine need for the ability to put tasks in the background
>> (a la the SCHED_IDLEPRIO policy in Con Kolivas's -sc kernels) as is
>> evidenced by comments in LKML re a desire for SCHED_BATCH tasks
>> to run completely in the background.
>>
>> Solution:
>>
>> Of course, one option would have been to just modify SCHED_BATCH so
>> that tasks with that policy run completely in the background but there is a
>> genuine need for a non background batch policy so the solution adopted
>> is to implement a new policy SCHED_BGND.
>>
>> SCHED_BATCH means that it's a normal process and should get a fair
>> share of the CPU in accordance with its "nice" setting but it is NOT an
>> interactive task and should NOT receive any of the special treatment
>> that a task that is adjudged to be interactive receives. In particular,
>> it should always be moved to the expired array at the end of its time
>> slice as to do otherwise might result in CPU starvation for other tasks.
>>
>> SCHED_BGND means it's totally unimportant and should only be given the
>> CPU if no one else wants it OR if not giving it the CPU could lead to
>> priority inversion or starvation of other tasks due to this task's holding
>> system resources.
>
> Could we just call it SCHED_IDLEPRIO since it's the same thing and there are
> tools out there that already use this name?
>
I'm easy. Which user space visible headers contain the definition?
That's the only place that it matters. When I was writing a program to
use this feature, I couldn't find a header that defined any of the
scheduler policies that was visible in user space (of course, that
doesn't mean there isn't one - just that I couldn't find it).
Peter
PS Any programs that use SCHED_IDLEPRIO should work as long as its value
is defined as 4.
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 0:49 ` Peter Williams
@ 2006-07-05 0:52 ` Con Kolivas
2006-07-05 8:05 ` Andreas Mohr
1 sibling, 0 replies; 21+ messages in thread
From: Con Kolivas @ 2006-07-05 0:52 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
On Wednesday 05 July 2006 10:49, Peter Williams wrote:
> Con Kolivas wrote:
> > Could we just call it SCHED_IDLEPRIO since it's the same thing and there
> > are tools out there that already use this name?
>
> I'm easy. Which user space visible headers contain the definition?
> That's the only place that it matters. When I was writing a program to
> use this feature, I couldn't find a header that defined any of the
> scheduler policies that was visible in user space (of course, that
> doesn't mean there isn't one - just that I couldn't find it).
Obviously nothing since this is out of tree stuff; it's hard coded into the
apps themselves currently.
> Peter
> PS Any programs that use SCHED_IDLEPRIO should work as long as its value
> is defined as 4.
Aye I just figured not confusing terminology would be nice.
--
-ck
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 0:49 ` Peter Williams
2006-07-05 0:52 ` Con Kolivas
@ 2006-07-05 8:05 ` Andreas Mohr
2006-07-05 14:04 ` Jan Engelhardt
1 sibling, 1 reply; 21+ messages in thread
From: Andreas Mohr @ 2006-07-05 8:05 UTC (permalink / raw)
To: Peter Williams
Cc: Con Kolivas, Andrew Morton, Nick Piggin, Linux Kernel,
Ingo Molnar
On Wed, Jul 05, 2006 at 10:49:38AM +1000, Peter Williams wrote:
> Con Kolivas wrote:
> >On Wednesday 05 July 2006 09:35, Peter Williams wrote:
> >Could we just call it SCHED_IDLEPRIO since it's the same thing and there
> >are tools out there that already use this name?
That makes quite some sense, seconded.
Plus, I like SCHED_IDLEPRIO more than SCHED_BGND, since it's more descriptive
when stuff happens and when not (but of course the SCHED_BGND name is shorter).
> I'm easy. Which user space visible headers contain the definition?
> That's the only place that it matters. When I was writing a program to
> use this feature, I couldn't find a header that defined any of the
> scheduler policies that was visible in user space (of course, that
> doesn't mean there isn't one - just that I couldn't find it).
>
> Peter
> PS Any programs that use SCHED_IDLEPRIO should work as long as its value
> is defined as 4.
OK, nice, but:
2.6.17-ck1:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
#define SCHED_ISO 4
#define SCHED_IDLEPRIO 5
#define SCHED_MIN 0
#define SCHED_MAX 5
Arggl.
So what does that tell us?
Does it tell us that the new policy should indeed be called SCHED_BGND
so that user-space programs that make use of a policy (e.g. schedtool
or any user-space app that wants to adjust its policy on its own) can figure
out which policy number to use (the 4 vs. 5 difference) by
#ifdef SCHED_IDLEPRIO elseif SCHED_BGND checks?
A less favourable solution would be to rename SCHED_BGND to SCHED_IDLEPRIO
but keep the current last policy number (4) for it, since that would
introduce a gross conflict (or do programs always go after the
policy name defines instead of the raw policy numbers?).
Given this issue, maybe best would be to add SCHED_IDLEPRIO *and* SCHED_ISO
to mainline at the same time, in order to keep the current
policy numbering extension as done in -ck.
Or maybe we should even introduce a more flexible way of dealing with scheduling
policy registration and listing? A numbered solution that changes on a whim
whenever someone adds his own policy may be.... numbered ;) (in terms of days,
that is).
Semi-hard-coding scheduling numbers that get used by user-space sounds
somewhat hackish to me, maybe we should instead (or additionally?) have a
query API that returns a list of policy numbers for a given query input
(near-hard realtime features, non-root access, interactivity, cache usage
friendliness, batched operation, background processing).
What do other systems (*BSD, Solaris, ...) do in this case?
Andreas Mohr
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 8:05 ` Andreas Mohr
@ 2006-07-05 14:04 ` Jan Engelhardt
0 siblings, 0 replies; 21+ messages in thread
From: Jan Engelhardt @ 2006-07-05 14:04 UTC (permalink / raw)
To: Andreas Mohr
Cc: Peter Williams, Con Kolivas, Andrew Morton, Nick Piggin,
Linux Kernel, Ingo Molnar
>> Peter
>> PS Any programs that use SCHED_IDLEPRIO should work as long as its value
>> is defined as 4.
>
>OK, nice, but:
>
>2.6.17-ck1:
>
>/*
> * Scheduling policies
> */
>#define SCHED_NORMAL 0
>#define SCHED_FIFO 1
>#define SCHED_RR 2
>#define SCHED_BATCH 3
>#define SCHED_ISO 4
>#define SCHED_IDLEPRIO 5
>
>#define SCHED_MIN 0
>#define SCHED_MAX 5
>
>
>Arggl.
>
>So what does that tell us?
>
We need a common header now.
Jan Engelhardt
--
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-04 23:35 [PATCH] sched: Add SCHED_BGND (background) scheduling policy Peter Williams
2006-07-05 0:14 ` Con Kolivas
@ 2006-07-05 0:44 ` Con Kolivas
2006-07-05 1:15 ` Peter Williams
2006-07-05 3:06 ` Peter Williams
2006-07-05 6:35 ` Ingo Molnar
2006-07-05 11:42 ` Mike Galbraith
3 siblings, 2 replies; 21+ messages in thread
From: Con Kolivas @ 2006-07-05 0:44 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
some quick comments within code below.
On Wednesday 05 July 2006 09:35, Peter Williams wrote:
> ---
> include/linux/init_task.h | 6 -
> include/linux/sched.h | 11 ++
> kernel/fork.c | 1
> kernel/mutex.c | 28 ++++++-
> kernel/sched.c | 183
> ++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 192
> insertions(+), 37 deletions(-)
>
> Index: MM-2.6.17-mm6/include/linux/init_task.h
> ===================================================================
> --- MM-2.6.17-mm6.orig/include/linux/init_task.h 2006-07-04
> 14:37:42.000000000 +1000 +++
> MM-2.6.17-mm6/include/linux/init_task.h 2006-07-04 14:38:12.000000000 +1000
> @@ -99,9 +99,9 @@ extern struct group_info init_groups;
> .usage = ATOMIC_INIT(2), \
> .flags = 0, \
> .lock_depth = -1, \
> - .prio = MAX_PRIO-20, \
> - .static_prio = MAX_PRIO-20, \
> - .normal_prio = MAX_PRIO-20, \
> + .prio = MAX_RT_PRIO+20, \
> + .static_prio = MAX_RT_PRIO+20, \
> + .normal_prio = MAX_RT_PRIO+20, \
> .policy = SCHED_NORMAL, \
> .cpus_allowed = CPU_MASK_ALL, \
> .mm = NULL, \
> Index: MM-2.6.17-mm6/include/linux/sched.h
> ===================================================================
> --- MM-2.6.17-mm6.orig/include/linux/sched.h 2006-07-04 14:37:43.000000000
> +1000 +++ MM-2.6.17-mm6/include/linux/sched.h 2006-07-04 14:38:12.000000000
> +1000 @@ -34,6 +34,8 @@
> #define SCHED_FIFO 1
> #define SCHED_RR 2
> #define SCHED_BATCH 3
> +/* Scheduler class for background tasks */
> +#define SCHED_BGND 4
>
> #ifdef __KERNEL__
>
> @@ -503,13 +505,16 @@ struct signal_struct {
> #define MAX_USER_RT_PRIO 100
> #define MAX_RT_PRIO MAX_USER_RT_PRIO
>
> -#define MAX_PRIO (MAX_RT_PRIO + 40)
> +#define BGND_PRIO (MAX_RT_PRIO + 40)
> +/* add another slot for SCHED_BGND tasks */
> +#define MAX_PRIO (BGND_PRIO + 1)
>
> #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
> #define rt_task(p) rt_prio((p)->prio)
> #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
> #define has_rt_policy(p) \
> - unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
> + unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)
idleprio tasks should be able to get rt_policy as well
> +#define bgnd_task(p) (unlikely((p)->policy == SCHED_BGND))
>
> /*
> * Some day this will be a full-fledged user tracking system..
> @@ -810,6 +815,7 @@ struct task_struct {
> unsigned long sleep_avg;
> unsigned long long timestamp, last_ran;
> unsigned long long sched_time; /* sched_clock time spent running */
> + unsigned int mutexes_held; /* for knowing when it's safe to repress
> SCHED_BGND tasks */ enum sleep_type sleep_type;
>
> unsigned long policy;
> @@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
> #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
> #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
> #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset
> */ +#define PF_UIWAKE 0x08000000 /* Just woken from uninterruptible sleep */
> #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
> #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex
> tester */
>
> Index: MM-2.6.17-mm6/kernel/sched.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/sched.c 2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/sched.c 2006-07-04 14:38:12.000000000 +1000
> @@ -59,7 +59,7 @@
>
> /*
> * Convert user-nice values [ -20 ... 0 ... 19 ]
> - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
> + * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
> * and back.
> */
> #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
> @@ -73,7 +73,7 @@
> */
> #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
> #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
> -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
> +#define MAX_USER_PRIO (USER_PRIO(BGND_PRIO))
>
> /*
> * Some helpers for converting nanosecond timing to jiffy resolution
> @@ -171,7 +171,7 @@
> */
>
> #define SCALE_PRIO(x, prio) \
> - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
> + max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
>
> static unsigned int static_prio_timeslice(int static_prio)
> {
> @@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
> return static_prio_timeslice(p->static_prio);
> }
>
> +#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
> +#define safe_to_background(p) \
> + (!((p)->mutexes_held || \
> + (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
> +
> /*
> * These are the runqueue data structures:
> */
> @@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
> {
> int bonus, prio;
>
> + /* Ensure that background tasks stay at BGND_PRIO */
> + if (bgnd_task(p) && safe_to_background(p))
> + return BGND_PRIO;
> +
> bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
>
> prio = p->static_prio - bonus;
> if (prio < MAX_RT_PRIO)
> prio = MAX_RT_PRIO;
> - if (prio > MAX_PRIO-1)
> - prio = MAX_PRIO-1;
> + if (prio > BGND_PRIO-1)
> + prio = BGND_PRIO-1;
> return prio;
> }
>
> @@ -761,8 +770,18 @@ static void set_load_weight(struct task_
> else
> #endif
> p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
> - } else
> - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
> + } else {
> + /*
> + * Reduce the probability of a task escaping the background
> + * due to load balancing leaving it on a lightly used CPU.
> + * Can't use zero as that would kill load balancing when only
> + * background tasks are running.
> + */
> + if (bgnd_task(p))
> + p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);
Why not just set it to 1 for all idleprio tasks? The granularity will be lost
at anything lower anyway and it avoids a more complex calculation.
> + else
> + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
> + }
> }
>
> static inline void
> @@ -834,7 +853,10 @@ static void __activate_task(struct task_
> {
> struct prio_array *target = rq->active;
>
> - if (batch_task(p))
> + /* Don't punish batch tasks, just tasks actually in the background
An extra line here for multiline comments such as:
+ /*
+ * Don't punish batch tasks, just tasks actually in the background
> + * as anything else is counterproductive from a system-wide aspect
> + */
> + if (task_in_background(p))
> target = rq->expired;
> enqueue_task(p, target);
> inc_nr_running(p, rq);
> @@ -942,6 +964,8 @@ static void activate_task(struct task_st
> if (!rt_task(p))
> p->prio = recalc_task_prio(p, now);
>
> + p->flags &= ~PF_UIWAKE;
> +
> /*
> * This checks to make sure it's not an uninterruptible task
> * that is now waking up.
> @@ -1484,6 +1508,7 @@ out_activate:
> * sleep_avg beyond just interactive state.
> */
> p->sleep_type = SLEEP_NONINTERACTIVE;
> + p->flags |= PF_UIWAKE;
> } else
>
> /*
> @@ -3024,6 +3049,48 @@ void scheduler_tick(void)
> }
> goto out_unlock;
> }
> +
> + if (bgnd_task(p)) {
> + /*
> + * Do this even if there's only one task on the queue as
> + * we want to set the priority low so that any waking tasks
> + * can preempt.
> + */
> + if (task_in_background(p)) {
> + /*
> + * Tasks currently in the background will be
> + * at BGND_PRIO priority and preemption
> + * should be enough to keep them in check provided we
> + * don't let them adversely effect tasks on the expired
ok I'm going to risk a lart and say "affect" ?
> + * array.
> + */
> + if (!safe_to_background(p)) {
> + dequeue_task(p, rq->active);
> + p->prio = effective_prio(p);
> + enqueue_task(p, rq->active);
> + } else if (rq->expired->nr_active &&
> + rq->best_expired_prio < p->prio) {
> + dequeue_task(p, rq->active);
> + enqueue_task(p, rq->expired);
> + set_tsk_need_resched(p);
> + goto out_unlock;
> + }
> + }
> + else if (safe_to_background(p)) {
> + dequeue_task(p, rq->active);
> + p->normal_prio = BGND_PRIO;
> + /* this should be safe for PI purposes */
> + p->prio = p->normal_prio;
> + enqueue_task(p, rq->expired);
> + /*
> + * think about making this conditional to reduce
> + * context switch rate
> + */
> + set_tsk_need_resched(p);
> + goto out_unlock;
> + }
> + }
> +
> if (!--p->time_slice) {
> dequeue_task(p, rq->active);
> set_tsk_need_resched(p);
> @@ -3033,6 +3100,11 @@ void scheduler_tick(void)
>
> if (!rq->expired_timestamp)
> rq->expired_timestamp = jiffies;
> + /*
> + * No need to do anything special for background tasks here
> + * as TASK_INTERACTIVE() should fail when they're in the
> + * background.
> + */
> if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
> enqueue_task(p, rq->expired);
> if (p->static_prio < rq->best_expired_prio)
> @@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
> }
>
> /*
> + * task time slice for SMT dependent idle purposes
> + */
> +static unsigned int smt_timeslice(struct task_struct *p)
> +{
> + if (task_in_background(p))
> + return 0;
> +
> + return task_timeslice(p);
> +}
> +
> +/*
> + * Is the thisp a higher priority task than thatp for SMT dependent idle
> + * purposes?
> + */
> +static int task_priority_gt(const struct task_struct *thisp,
> + const struct task_struct *thatp)
> +{
> + if (task_in_background(thisp))
> + return !task_in_background(thatp);
> +
> + if (task_in_background(thatp))
> + return 1;
> +
> + return thisp->static_prio < thatp->static_prio;
> +}
> +
> +/*
> * To minimise lock contention and not have to drop this_rq's runlock we only
> * trylock the sibling runqueues and bypass those runqueues if we fail to
> * acquire their lock. As we only trylock the normal locking order does not
> @@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
> (sd->per_cpu_gain * DEF_TIMESLICE / 100))
> ret = 1;
> } else {
> - if (smt_curr->static_prio < p->static_prio &&
> + if (task_priority_gt(smt_curr, p) &&
> !TASK_PREEMPTS_CURR(p, smt_rq) &&
> - smt_slice(smt_curr, sd) > task_timeslice(p))
> + smt_slice(smt_curr, sd) > smt_timeslice(p))
> ret = 1;
> }
> unlock:
> @@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
> }
>
> /*
> + * Switch the active and expired arrays.
> + */
> +static struct prio_array *switch_arrays(struct rq *rq, int best_active_prio)
> +{
In the fast path this should be inlined even if it is large.
> + struct prio_array *array = rq->active;
> +
> + schedstat_inc(rq, sched_switch);
> + rq->active = rq->expired;
> + rq->expired = array;
> + rq->expired_timestamp = 0;
> + rq->best_expired_prio = best_active_prio;
> +
> + return rq->active;
> +}
> +
> +/*
> * schedule() is the main scheduler function.
> */
> asmlinkage void __sched schedule(void)
> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
> }
>
> array = rq->active;
> - if (unlikely(!array->nr_active)) {
> - /*
> - * Switch the active and expired arrays.
> - */
> - schedstat_inc(rq, sched_switch);
> - rq->active = rq->expired;
> - rq->expired = array;
> - array = rq->active;
> - rq->expired_timestamp = 0;
> - rq->best_expired_prio = MAX_PRIO;
> - }
> + if (unlikely(!array->nr_active))
> + array = switch_arrays(rq, MAX_PRIO);
>
> idx = sched_find_first_bit(array->bitmap);
> +get_next:
> queue = array->queue + idx;
> next = list_entry(queue->next, struct task_struct, run_list);
> + /* very strict backgrounding */
> + if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
> + int tmp = sched_find_first_bit(rq->expired->bitmap);
> +
> + if (likely(tmp < idx)) {
> + array = switch_arrays(rq, idx);
> + idx = tmp;
> + goto get_next;
> + }
> + }
>
> - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
> + if (!rt_task(next) && interactive_sleep(next->sleep_type) &&
> + !bgnd_task(next)) {
> unsigned long long delta = now - next->timestamp;
> if (unlikely((long long)(now - next->timestamp) < 0))
> delta = 0;
> @@ -4052,7 +4169,8 @@ recheck:
> if (policy < 0)
> policy = oldpolicy = p->policy;
> else if (policy != SCHED_FIFO && policy != SCHED_RR &&
> - policy != SCHED_NORMAL && policy != SCHED_BATCH)
> + policy != SCHED_NORMAL && policy != SCHED_BATCH &&
> + policy != SCHED_BGND)
how about a wrapper for all these policies? (see my sched_range patch)
> return -EINVAL;
> /*
> * Valid priorities for SCHED_FIFO and SCHED_RR are
> @@ -4063,8 +4181,8 @@ recheck:
> (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
> (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
> return -EINVAL;
> - if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
> - != (param->sched_priority == 0))
> + if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
> + policy == SCHED_BGND) != (param->sched_priority == 0))
> return -EINVAL;
same
> /*
> @@ -4072,15 +4190,20 @@ recheck:
> */
> if (!capable(CAP_SYS_NICE)) {
> /*
> - * can't change policy, except between SCHED_NORMAL
> - * and SCHED_BATCH:
> + * can't change policy, except between SCHED_NORMAL,
> + * SCHED_BATCH or SCHED_BGND:
> */
> - if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
> - (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
> + if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
> + p->policy != SCHED_BGND) &&
> + (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
> + p->policy != SCHED_BGND) &&
> + (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
> + p->policy != SCHED_BATCH)) &&
same
> !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
> return -EPERM;
> /* can't increase priority */
> - if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
> + if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
> + policy != SCHED_BGND) &&
> param->sched_priority > p->rt_priority &&
> param->sched_priority >
> p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
> Index: MM-2.6.17-mm6/kernel/mutex.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
> @@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
>
> EXPORT_SYMBOL(__mutex_init);
>
> +static inline void inc_mutex_count(void)
> +{
> + current->mutexes_held++;
> +}
> +
> +static inline void dec_mutex_count(void)
> +{
> + current->mutexes_held--;
> +}
> +
> /*
> * We split the mutex lock/unlock logic into separate fastpath and
> * slowpath functions, to reduce the register pressure on the fastpath.
> @@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
> * 'unlocked' into 'locked' state.
> */
> __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
> + inc_mutex_count();
> }
>
> EXPORT_SYMBOL(mutex_lock);
> @@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
> * into 'unlocked' state:
> */
> __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
> + dec_mutex_count();
> }
>
> EXPORT_SYMBOL(mutex_unlock);
> @@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
> */
> int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
> {
> + int ret;
> +
> might_sleep();
> - return __mutex_fastpath_lock_retval
> + ret = __mutex_fastpath_lock_retval
> (&lock->count, __mutex_lock_interruptible_slowpath);
> +
> + if (likely(!ret))
> + inc_mutex_count();
> +
> + return ret;
> }
>
> EXPORT_SYMBOL(mutex_lock_interruptible);
> @@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
> */
> int fastcall __sched mutex_trylock(struct mutex *lock)
> {
> - return __mutex_fastpath_trylock(&lock->count,
> + int ret = __mutex_fastpath_trylock(&lock->count,
> __mutex_trylock_slowpath);
> +
> + if (likely(ret))
> + inc_mutex_count();
> +
> + return ret;
> }
>
> EXPORT_SYMBOL(mutex_trylock);
> Index: MM-2.6.17-mm6/kernel/fork.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/fork.c 2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/fork.c 2006-07-04 14:38:12.000000000 +1000
> @@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
> p->wchar = 0; /* I/O counter: bytes written */
> p->syscr = 0; /* I/O counter: read syscalls */
> p->syscw = 0; /* I/O counter: write syscalls */
> + p->mutexes_held = 0;
> acct_clear_integrals(p);
>
> p->it_virt_expires = cputime_zero;
--
-ck
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 0:44 ` Con Kolivas
@ 2006-07-05 1:15 ` Peter Williams
2006-07-05 1:33 ` Con Kolivas
2006-07-05 3:06 ` Peter Williams
1 sibling, 1 reply; 21+ messages in thread
From: Peter Williams @ 2006-07-05 1:15 UTC (permalink / raw)
To: Con Kolivas; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
Con Kolivas wrote:
> some quick comments within code below.
>
> On Wednesday 05 July 2006 09:35, Peter Williams wrote:
>> ---
>> include/linux/init_task.h | 6 -
>> include/linux/sched.h | 11 ++
>> kernel/fork.c | 1
>> kernel/mutex.c | 28 ++++++-
>> kernel/sched.c | 183 ++++++++++++++++++++++++++++++++++++++--------
>> 5 files changed, 192 insertions(+), 37 deletions(-)
>>
>> Index: MM-2.6.17-mm6/include/linux/init_task.h
>> ===================================================================
>> --- MM-2.6.17-mm6.orig/include/linux/init_task.h 2006-07-04 14:37:42.000000000 +1000
>> +++ MM-2.6.17-mm6/include/linux/init_task.h 2006-07-04 14:38:12.000000000 +1000
>> @@ -99,9 +99,9 @@ extern struct group_info init_groups;
>> .usage = ATOMIC_INIT(2), \
>> .flags = 0, \
>> .lock_depth = -1, \
>> - .prio = MAX_PRIO-20, \
>> - .static_prio = MAX_PRIO-20, \
>> - .normal_prio = MAX_PRIO-20, \
>> + .prio = MAX_RT_PRIO+20, \
>> + .static_prio = MAX_RT_PRIO+20, \
>> + .normal_prio = MAX_RT_PRIO+20, \
>> .policy = SCHED_NORMAL, \
>> .cpus_allowed = CPU_MASK_ALL, \
>> .mm = NULL, \
>> Index: MM-2.6.17-mm6/include/linux/sched.h
>> ===================================================================
>> --- MM-2.6.17-mm6.orig/include/linux/sched.h 2006-07-04 14:37:43.000000000 +1000
>> +++ MM-2.6.17-mm6/include/linux/sched.h 2006-07-04 14:38:12.000000000 +1000
>> @@ -34,6 +34,8 @@
>> #define SCHED_FIFO 1
>> #define SCHED_RR 2
>> #define SCHED_BATCH 3
>> +/* Scheduler class for background tasks */
>> +#define SCHED_BGND 4
>>
>> #ifdef __KERNEL__
>>
>> @@ -503,13 +505,16 @@ struct signal_struct {
>> #define MAX_USER_RT_PRIO 100
>> #define MAX_RT_PRIO MAX_USER_RT_PRIO
>>
>> -#define MAX_PRIO (MAX_RT_PRIO + 40)
>> +#define BGND_PRIO (MAX_RT_PRIO + 40)
>> +/* add another slot for SCHED_BGND tasks */
>> +#define MAX_PRIO (BGND_PRIO + 1)
>>
>> #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
>> #define rt_task(p) rt_prio((p)->prio)
>> #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
>> #define has_rt_policy(p) \
>> - unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
>> + unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)
>
> idleprio tasks should be able to get rt_policy as well
I don't understand what you mean here. A task can only have one
scheduling policy. The simple (direct) definition of has_rt_policy() is
(p->policy == SCHED_FIFO || p->policy == SCHED_RR) and the one defined
is just a rearrangement of that with a view to minimizing overhead in
the majority of invocations.
>
>> +#define bgnd_task(p) (unlikely((p)->policy == SCHED_BGND))
>>
>> /*
>> * Some day this will be a full-fledged user tracking system..
>> @@ -810,6 +815,7 @@ struct task_struct {
>> unsigned long sleep_avg;
>> unsigned long long timestamp, last_ran;
>> unsigned long long sched_time; /* sched_clock time spent running */
>> + unsigned int mutexes_held; /* for knowing when it's safe to repress SCHED_BGND tasks */
>> enum sleep_type sleep_type;
>>
>> unsigned long policy;
>> @@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
>> #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
>> #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
>> #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
>> +#define PF_UIWAKE 0x08000000 /* Just woken from uninterruptible sleep */
>> #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
>> #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
>>
>> Index: MM-2.6.17-mm6/kernel/sched.c
>> ===================================================================
>> --- MM-2.6.17-mm6.orig/kernel/sched.c 2006-07-04 14:37:43.000000000 +1000
>> +++ MM-2.6.17-mm6/kernel/sched.c 2006-07-04 14:38:12.000000000 +1000
>> @@ -59,7 +59,7 @@
>>
>> /*
>> * Convert user-nice values [ -20 ... 0 ... 19 ]
>> - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
>> + * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
>> * and back.
>> */
>> #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
>> @@ -73,7 +73,7 @@
>> */
>> #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
>> #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
>> -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
>> +#define MAX_USER_PRIO (USER_PRIO(BGND_PRIO))
>>
>> /*
>> * Some helpers for converting nanosecond timing to jiffy resolution
>> @@ -171,7 +171,7 @@
>> */
>>
>> #define SCALE_PRIO(x, prio) \
>> - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
>> + max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
>>
>> static unsigned int static_prio_timeslice(int static_prio)
>> {
>> @@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
>> return static_prio_timeslice(p->static_prio);
>> }
>>
>> +#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
>> +#define safe_to_background(p) \
>> + (!((p)->mutexes_held || \
>> + (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
>> +
>> /*
>> * These are the runqueue data structures:
>> */
>> @@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
>> {
>> int bonus, prio;
>>
>> + /* Ensure that background tasks stay at BGND_PRIO */
>> + if (bgnd_task(p) && safe_to_background(p))
>> + return BGND_PRIO;
>> +
>> bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
>>
>> prio = p->static_prio - bonus;
>> if (prio < MAX_RT_PRIO)
>> prio = MAX_RT_PRIO;
>> - if (prio > MAX_PRIO-1)
>> - prio = MAX_PRIO-1;
>> + if (prio > BGND_PRIO-1)
>> + prio = BGND_PRIO-1;
>> return prio;
>> }
>>
>> @@ -761,8 +770,18 @@ static void set_load_weight(struct task_
>> else
>> #endif
>> p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
>> - } else
>> - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
>> + } else {
>> + /*
>> + * Reduce the probability of a task escaping the background
>> + * due to load balancing leaving it on a lightly used CPU.
>> + * Can't use zero as that would kill load balancing when only
>> + * background tasks are running.
>> + */
>> + if (bgnd_task(p))
>> + p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);
>
> Why not just set it to 1 for all idleprio tasks? The granularity will be lost
> at anything lower anyway and it avoids a more complex calculation.
>
>> + else
>> + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
>> + }
>> }
>>
>> static inline void
>> @@ -834,7 +853,10 @@ static void __activate_task(struct task_
>> {
>> struct prio_array *target = rq->active;
>>
>> - if (batch_task(p))
>> + /* Don't punish batch tasks, just tasks actually in the background
>
> An extra line here for multiline comments such as:
> + /*
> + * Don't punish batch tasks, just tasks actually in the background
>
>
>> + * as anything else is counterproductive from a system-wide aspect
>> + */
>> + if (task_in_background(p))
>> target = rq->expired;
>> enqueue_task(p, target);
>> inc_nr_running(p, rq);
>> @@ -942,6 +964,8 @@ static void activate_task(struct task_st
>> if (!rt_task(p))
>> p->prio = recalc_task_prio(p, now);
>>
>> + p->flags &= ~PF_UIWAKE;
>> +
>> /*
>> * This checks to make sure it's not an uninterruptible task
>> * that is now waking up.
>> @@ -1484,6 +1508,7 @@ out_activate:
>> * sleep_avg beyond just interactive state.
>> */
>> p->sleep_type = SLEEP_NONINTERACTIVE;
>> + p->flags |= PF_UIWAKE;
>> } else
>>
>> /*
>> @@ -3024,6 +3049,48 @@ void scheduler_tick(void)
>> }
>> goto out_unlock;
>> }
>> +
>> + if (bgnd_task(p)) {
>> + /*
>> + * Do this even if there's only one task on the queue as
>> + * we want to set the priority low so that any waking tasks
>> + * can preempt.
>> + */
>> + if (task_in_background(p)) {
>> + /*
>> + * Tasks currently in the background will be
>> + * at BGND_PRIO priority and preemption
>> + * should be enough to keep them in check provided we
>> + * don't let them adversely effect tasks on the expired
>
> ok I'm going to risk a lart and say "affect" ?
I have to refer you to the Oxford English Dictionary. According to it
(when used as a verb):
affect: 1. like, love 2. like to use, practice or wear 3. aim at, seek
4. use or display ostentatiously 5. assume a false appearance 6. attack
as a disease 7. move or touch.
effect: 1. bring about (an event or result) 2. produce (a state or
condition) 3. make, construct or build
>
>> + * array.
>> + */
>> + if (!safe_to_background(p)) {
>> + dequeue_task(p, rq->active);
>> + p->prio = effective_prio(p);
>> + enqueue_task(p, rq->active);
>> + } else if (rq->expired->nr_active &&
>> + rq->best_expired_prio < p->prio) {
>> + dequeue_task(p, rq->active);
>> + enqueue_task(p, rq->expired);
>> + set_tsk_need_resched(p);
>> + goto out_unlock;
>> + }
>> + }
>> + else if (safe_to_background(p)) {
>> + dequeue_task(p, rq->active);
>> + p->normal_prio = BGND_PRIO;
>> + /* this should be safe for PI purposes */
>> + p->prio = p->normal_prio;
>> + enqueue_task(p, rq->expired);
>> + /*
>> + * think about making this conditional to reduce
>> + * context switch rate
>> + */
>> + set_tsk_need_resched(p);
>> + goto out_unlock;
>> + }
>> + }
>> +
>> if (!--p->time_slice) {
>> dequeue_task(p, rq->active);
>> set_tsk_need_resched(p);
>> @@ -3033,6 +3100,11 @@ void scheduler_tick(void)
>>
>> if (!rq->expired_timestamp)
>> rq->expired_timestamp = jiffies;
>> + /*
>> + * No need to do anything special for background tasks here
>> + * as TASK_INTERACTIVE() should fail when they're in the
>> + * background.
>> + */
>> if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
>> enqueue_task(p, rq->expired);
>> if (p->static_prio < rq->best_expired_prio)
>> @@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
>> }
>>
>> /*
>> + * task time slice for SMT dependent idle purposes
>> + */
>> +static unsigned int smt_timeslice(struct task_struct *p)
>> +{
>> + if (task_in_background(p))
>> + return 0;
>> +
>> + return task_timeslice(p);
>> +}
>> +
>> +/*
>> + * Is the thisp a higher priority task than thatp for SMT dependent idle
>> + * purposes?
>> + */
>> +static int task_priority_gt(const struct task_struct *thisp,
>> + const struct task_struct *thatp)
>> +{
>> + if (task_in_background(thisp))
>> + return !task_in_background(thatp);
>> +
>> + if (task_in_background(thatp))
>> + return 1;
>> +
>> + return thisp->static_prio < thatp->static_prio;
>> +}
>> +
>> +/*
>> * To minimise lock contention and not have to drop this_rq's runlock we only
>> * trylock the sibling runqueues and bypass those runqueues if we fail to
>> * acquire their lock. As we only trylock the normal locking order does not
>> @@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
>> (sd->per_cpu_gain * DEF_TIMESLICE / 100))
>> ret = 1;
>> } else {
>> - if (smt_curr->static_prio < p->static_prio &&
>> + if (task_priority_gt(smt_curr, p) &&
>> !TASK_PREEMPTS_CURR(p, smt_rq) &&
>> - smt_slice(smt_curr, sd) > task_timeslice(p))
>> + smt_slice(smt_curr, sd) > smt_timeslice(p))
>> ret = 1;
>> }
>> unlock:
>> @@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
>> }
>>
>> /*
>> + * Switch the active and expired arrays.
>> + */
>> +static struct prio_array *switch_arrays(struct rq *rq, int best_active_prio)
>> +{
>
> In the fast path this should be inlined even if it is large.
OK.
>
>> + struct prio_array *array = rq->active;
>> +
>> + schedstat_inc(rq, sched_switch);
>> + rq->active = rq->expired;
>> + rq->expired = array;
>> + rq->expired_timestamp = 0;
>> + rq->best_expired_prio = best_active_prio;
>> +
>> + return rq->active;
>> +}
>> +
>> +/*
>> * schedule() is the main scheduler function.
>> */
>> asmlinkage void __sched schedule(void)
>> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
>> }
>>
>> array = rq->active;
>> - if (unlikely(!array->nr_active)) {
>> - /*
>> - * Switch the active and expired arrays.
>> - */
>> - schedstat_inc(rq, sched_switch);
>> - rq->active = rq->expired;
>> - rq->expired = array;
>> - array = rq->active;
>> - rq->expired_timestamp = 0;
>> - rq->best_expired_prio = MAX_PRIO;
>> - }
>> + if (unlikely(!array->nr_active))
>> + array = switch_arrays(rq, MAX_PRIO);
>>
>> idx = sched_find_first_bit(array->bitmap);
>> +get_next:
>> queue = array->queue + idx;
>> next = list_entry(queue->next, struct task_struct, run_list);
>> + /* very strict backgrounding */
>> + if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
>> + int tmp = sched_find_first_bit(rq->expired->bitmap);
>> +
>> + if (likely(tmp < idx)) {
>> + array = switch_arrays(rq, idx);
>> + idx = tmp;
>> + goto get_next;
>> + }
>> + }
>>
>> - if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
>> + if (!rt_task(next) && interactive_sleep(next->sleep_type) &&
>> + !bgnd_task(next)) {
>> unsigned long long delta = now - next->timestamp;
>> if (unlikely((long long)(now - next->timestamp) < 0))
>> delta = 0;
>> @@ -4052,7 +4169,8 @@ recheck:
>> if (policy < 0)
>> policy = oldpolicy = p->policy;
>> else if (policy != SCHED_FIFO && policy != SCHED_RR &&
>> - policy != SCHED_NORMAL && policy != SCHED_BATCH)
>> + policy != SCHED_NORMAL && policy != SCHED_BATCH &&
>> + policy != SCHED_BGND)
>
> how about a wrapper for all these policies? (see my sched_range patch)
I must admit that when I was doing this I wished it was less messy. But
I figured that it was best to do it in a way that was easy to show as
being correct and then do simplification later. Especially, as
simplification would also effect the other policies.
>
>> [...]
>
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 1:15 ` Peter Williams
@ 2006-07-05 1:33 ` Con Kolivas
2006-07-05 4:20 ` Valdis.Kletnieks
0 siblings, 1 reply; 21+ messages in thread
From: Con Kolivas @ 2006-07-05 1:33 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
On Wednesday 05 July 2006 11:15, Peter Williams wrote:
> Con Kolivas wrote:
> > some quick comments within code below.
> >
> > On Wednesday 05 July 2006 09:35, Peter Williams wrote:
> >> - unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
> >> + unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)
> >
> > idleprio tasks should be able to get rt_policy as well
>
> I don't understand what you mean here. A task can only have one
> scheduling policy. The simple (direct) definition of has_rt_policy() is
> (p->policy == SCHED_FIFO || p->policy == SCHED_RR) and the one defined
> is just a rearrangement of that with a view to minimizing overhead in
> the majority of invocations.
I meant they could get rt priority. This does look correct, sorry.
> >> + * Tasks currently in the background will be
> >> + * at BGND_PRIO priority and preemption
> >> + * should be enough to keep them in check provided we
> >> + * don't let them adversely effect tasks on the expired
> >
> > ok I'm going to risk a lart and say "affect" ?
>
> I have to refer you to the Oxford English Dictionary.
I was hoping you would.
> According to it
> (when used as a verb):
>
> affect: 1. like, love 2. like to use, practice or wear 3. aim at, seek
> 4. use or display ostentatiously 5. assume a false appearance 6. attack
> as a disease 7. move or touch.
>
> effect: 1. bring about (an event or result) 2. produce (a state or
> condition) 3. make, construct or build
Let's take this discussion offlist for my benefit, as I'd like to nut this
out. I still see it as affect with those definitions.
--
-ck
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 1:33 ` Con Kolivas
@ 2006-07-05 4:20 ` Valdis.Kletnieks
0 siblings, 0 replies; 21+ messages in thread
From: Valdis.Kletnieks @ 2006-07-05 4:20 UTC (permalink / raw)
To: Con Kolivas
Cc: Peter Williams, Andrew Morton, Nick Piggin, Linux Kernel,
Ingo Molnar
On Wed, 05 Jul 2006 11:33:42 +1000, Con Kolivas said:
> On Wednesday 05 July 2006 11:15, Peter Williams wrote:
> > Con Kolivas wrote:
> > >> + * don't let them adversely effect tasks on the expired
> > >
> > > ok I'm going to risk a lart and say "affect" ?
No, that would be correct English.
> > I have to refer you to the Oxford English Dictionary.
Actually, something like Strunk&White's "Elements of Style" is better suited
to this sort of thing than the OED. The OED just lists *words*, not how to
put them together.
http://www.amazon.com/gp/product/020530902X/qid=1152072733/sr=1-1/ref=sr_1_1/002-2430423-3716834?s=books&v=glance&n=283155
> I was hoping you would.
They would *affect* tasks. The *effect* of this would be...
Note that 'effect' can be a verb too, but in that sense it refers to
a "facilitator" - "The mayor effected change in policy" meaning that he
made it happen. So in the kernel, "A effects B" is only correct if A
is code that is intended to make B happen. If A, through blind all-elbows
coding, happens to cause B to change, then "A affects B" is proper.
Clear as mud? :) If not, read the Strunk&White explanation of this one. :)
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 0:44 ` Con Kolivas
2006-07-05 1:15 ` Peter Williams
@ 2006-07-05 3:06 ` Peter Williams
1 sibling, 0 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-05 3:06 UTC (permalink / raw)
To: Con Kolivas; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Ingo Molnar
Con Kolivas wrote:
> some quick comments within code below.
>
> On Wednesday 05 July 2006 09:35, Peter Williams wrote:
>> @@ -761,8 +770,18 @@ static void set_load_weight(struct task_
>> else
>> #endif
>> p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
>> - } else
>> - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
>> + } else {
>> + /*
>> + * Reduce the probability of a task escaping the background
>> + * due to load balancing leaving it on a lightly used CPU.
>> + * Can't use zero as that would kill load balancing when only
>> + * background tasks are running.
>> + */
>> + if (bgnd_task(p))
>> + p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);
>
> Why not just set it to 1 for all idleprio tasks? The granularity will be lost
> at anything lower anyway and it avoids a more complex calculation.
I missed this one in my previous reply. I agree, what you say makes
sense. I was in my "think too hard" mode and probably thinking
(unnecessarily) about how it might affect the smoothed load calculations.
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-04 23:35 [PATCH] sched: Add SCHED_BGND (background) scheduling policy Peter Williams
2006-07-05 0:14 ` Con Kolivas
2006-07-05 0:44 ` Con Kolivas
@ 2006-07-05 6:35 ` Ingo Molnar
2006-07-05 8:03 ` Peter Williams
2006-07-05 11:42 ` Mike Galbraith
3 siblings, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2006-07-05 6:35 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas
* Peter Williams <pwil3058@bigpond.net.au> wrote:
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
> @@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
>
> EXPORT_SYMBOL(__mutex_init);
>
> +static inline void inc_mutex_count(void)
> +{
> + current->mutexes_held++;
> +}
> +
> +static inline void dec_mutex_count(void)
> +{
> + current->mutexes_held--;
> +}
> +
NACK! This whole patch is way too intrusive for such a relatively small
gain.
also, if something doesn't hold a mutex, it might still be unsafe to
background it! For example if it holds a semaphore. Or an rwsem. Or any
other kernel resource that has exclusion semantics.
so unless this patch gets _much_ less complex and much less intrusive,
we'll have to stay with SCHED_BATCH and nice +19.
Ingo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 6:35 ` Ingo Molnar
@ 2006-07-05 8:03 ` Peter Williams
2006-07-05 8:15 ` Arjan van de Ven
2006-07-05 8:19 ` Ingo Molnar
0 siblings, 2 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-05 8:03 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas
Ingo Molnar wrote:
> * Peter Williams <pwil3058@bigpond.net.au> wrote:
>
>> ===================================================================
>> --- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
>> +++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
>> @@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
>>
>> EXPORT_SYMBOL(__mutex_init);
>>
>> +static inline void inc_mutex_count(void)
>> +{
>> + current->mutexes_held++;
>> +}
>> +
>> +static inline void dec_mutex_count(void)
>> +{
>> + current->mutexes_held--;
>> +}
>> +
>
> NACK! This whole patch is way too intrusive for such a relatively small
> gain.
>
> also, if something doesn't hold a mutex, it might still be unsafe to
> background it! For example if it holds a semaphore. Or an rwsem. Or any
> other kernel resource that has exclusion semantics.
>
> so unless this patch gets _much_ less complex and much less intrusive,
> we'll have to stay with SCHED_BATCH and nice +19.
This means being less strict but (as you imply) that may not be much
better than nice +19. I'll have a look at it.
Of course, a comprehensive (as opposed to RT only) priority inheritance
mechanism would make the "safe/unsafe to background" problem go away and
make this patch very simple. Any plans in that direction?
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 8:03 ` Peter Williams
@ 2006-07-05 8:15 ` Arjan van de Ven
2006-07-05 8:19 ` Ingo Molnar
1 sibling, 0 replies; 21+ messages in thread
From: Arjan van de Ven @ 2006-07-05 8:15 UTC (permalink / raw)
To: Peter Williams
Cc: Ingo Molnar, Andrew Morton, Nick Piggin, Linux Kernel,
Con Kolivas
>
> Of course, a comprehensive (as opposed to RT only) priority inheritance
> mechanism would make the "safe/unsafe to background" problem go away and
> make this patch very simple. Any plans in that direction?
there is a very simple prio inheritance mechanism available already:
just treat "running in kernel mode" as a "go to nice +19" thing, and I
suspect things will suddenly be safe ;)
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 8:03 ` Peter Williams
2006-07-05 8:15 ` Arjan van de Ven
@ 2006-07-05 8:19 ` Ingo Molnar
2006-07-05 17:40 ` Nick Piggin
1 sibling, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2006-07-05 8:19 UTC (permalink / raw)
To: Peter Williams; +Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas
* Peter Williams <pwil3058@bigpond.net.au> wrote:
> >>+static inline void inc_mutex_count(void)
> >>+{
> >>+ current->mutexes_held++;
> >>+}
> >>+
> >>+static inline void dec_mutex_count(void)
> >>+{
> >>+ current->mutexes_held--;
> >>+}
> >>+
> >
> >NACK! This whole patch is way too intrusive for such a relatively small
> >gain.
> >
> >also, if something doesn't hold a mutex, it might still be unsafe to
> >background it! For example if it holds a semaphore. Or an rwsem. Or any
> >other kernel resource that has exclusion semantics.
> >
> >so unless this patch gets _much_ less complex and much less intrusive,
> >we'll have to stay with SCHED_BATCH and nice +19.
>
> This means being less strict but (as you imply) that may not be much
> better than nice +19. I'll have a look at it.
it's way too much pain for little gain.
> Of course, a comprehensive (as opposed to RT only) priority
> inheritance mechanism would make the "safe/unsafe to background"
> problem go away and make this patch very simple. Any plans in that
> direction?
that seems quite unlikely to happen. I think you are missing the biggest
issue: for RT, if the priority inheritance mechanism does not extend to
a given scheduling pattern it causes longer latencies, but no harm is
done otherwise. But for SCHED_BGND we'd have to make sure _every_ place
is priority-inversions safe - otherwise we risk a potential local DoS if
a task with a critical resource is backgrounded! That's plain impossible
to achieve.
Ingo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 8:19 ` Ingo Molnar
@ 2006-07-05 17:40 ` Nick Piggin
0 siblings, 0 replies; 21+ messages in thread
From: Nick Piggin @ 2006-07-05 17:40 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Peter Williams, Andrew Morton, Linux Kernel, Con Kolivas
Ingo Molnar wrote:
> * Peter Williams <pwil3058@bigpond.net.au> wrote:
>>Of course, a comprehensive (as opposed to RT only) priority
>>inheritance mechanism would make the "safe/unsafe to background"
>>problem go away and make this patch very simple. Any plans in that
>>direction?
>
>
> that seems quite unlikely to happen. I think you are missing the biggest
> issue: for RT, if the priority inheritance mechanism does not extend to
> a given scheduling pattern it causes longer latencies, but no harm is
> done otherwise. But for SCHED_BGND we'd have to make sure _every_ place
> is priority-inversions safe - otherwise we risk a potential local DoS if
> a task with a critical resource is backgrounded! That's plain impossible
> to achieve.
Right. And it isn't just straightforward things like locks, but
any limited resource.
mempools and block device requests are two that come to mind.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-04 23:35 [PATCH] sched: Add SCHED_BGND (background) scheduling policy Peter Williams
` (2 preceding siblings ...)
2006-07-05 6:35 ` Ingo Molnar
@ 2006-07-05 11:42 ` Mike Galbraith
2006-07-05 13:59 ` Peter Williams
3 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2006-07-05 11:42 UTC (permalink / raw)
To: Peter Williams
Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas,
Ingo Molnar
On Wed, 2006-07-05 at 09:35 +1000, Peter Williams wrote:
> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
> }
>
> array = rq->active;
> - if (unlikely(!array->nr_active)) {
> - /*
> - * Switch the active and expired arrays.
> - */
> - schedstat_inc(rq, sched_switch);
> - rq->active = rq->expired;
> - rq->expired = array;
> - array = rq->active;
> - rq->expired_timestamp = 0;
> - rq->best_expired_prio = MAX_PRIO;
> - }
> + if (unlikely(!array->nr_active))
> + array = switch_arrays(rq, MAX_PRIO);
>
> idx = sched_find_first_bit(array->bitmap);
> +get_next:
> queue = array->queue + idx;
> next = list_entry(queue->next, struct task_struct, run_list);
> + /* very strict backgrounding */
> + if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
> + int tmp = sched_find_first_bit(rq->expired->bitmap);
> +
> + if (likely(tmp < idx)) {
> + array = switch_arrays(rq, idx);
> + idx = tmp;
> + goto get_next;
Won't this potentially expire the mutex holder which you specifically
protect in scheduler_tick() if it was preempted before being ticked?
The task in the expired array could also be a !safe_to_background() task
who already had a chance to run, and whose slice expired.
If it's worth protecting higher priority tasks from mutex holders ending
up in the expired array, then there's a case that should be examined.
There's little difference between a background task acquiring a mutex,
and a normal task with one tick left on its slice. Best for sleepers
is of course to just say no to expiring mutex holders, period.
-Mike
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 11:42 ` Mike Galbraith
@ 2006-07-05 13:59 ` Peter Williams
2006-07-05 14:18 ` Peter Williams
2006-07-05 14:48 ` Mike Galbraith
0 siblings, 2 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-05 13:59 UTC (permalink / raw)
To: Mike Galbraith
Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas,
Ingo Molnar
Mike Galbraith wrote:
> On Wed, 2006-07-05 at 09:35 +1000, Peter Williams wrote:
>
>> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
>> }
>>
>> array = rq->active;
>> - if (unlikely(!array->nr_active)) {
>> - /*
>> - * Switch the active and expired arrays.
>> - */
>> - schedstat_inc(rq, sched_switch);
>> - rq->active = rq->expired;
>> - rq->expired = array;
>> - array = rq->active;
>> - rq->expired_timestamp = 0;
>> - rq->best_expired_prio = MAX_PRIO;
>> - }
>> + if (unlikely(!array->nr_active))
>> + array = switch_arrays(rq, MAX_PRIO);
>>
>> idx = sched_find_first_bit(array->bitmap);
>> +get_next:
>> queue = array->queue + idx;
>> next = list_entry(queue->next, struct task_struct, run_list);
>> + /* very strict backgrounding */
>> + if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
>> + int tmp = sched_find_first_bit(rq->expired->bitmap);
>> +
>> + if (likely(tmp < idx)) {
>> + array = switch_arrays(rq, idx);
>> + idx = tmp;
>> + goto get_next;
>
> Won't this potentially expire the mutex holder which you specifically
> protect in scheduler_tick() if it was preempted before being ticked?
I don't think so as its prio value should cause task_in_background() to
fail.
> The task in the expired array could also be a !safe_to_background() task
> who already had a chance to run, and whose slice expired.
If it's !safe_to_background() it's in our interest to let it run in
order to free up the resource that it's holding.
>
> If it's worth protecting higher priority tasks from mutex holders ending
> up in the expired array, then there's a case that should be examined.
It's more than just stopping them from ending up in the expired array.
It's stopping them from staying permanently in the expired array.
> There's little difference between a background task acquiring a mutex,
> and a normal task with one tick left on its slice.
The difference is that the background task could stay there forever.
> Best for sleepers
> is of course to just say no to expiring mutex holders period.
In spite of my comments above, I agree that not expiring mutex holders
might (emphasis on the "might") be good for overall system performance
by reducing the time for which locks are held. Giving them a whole new
time slice on the active array might be too generous though. It could
become quite complex.
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 13:59 ` Peter Williams
@ 2006-07-05 14:18 ` Peter Williams
2006-07-05 14:48 ` Mike Galbraith
1 sibling, 0 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-05 14:18 UTC (permalink / raw)
To: Mike Galbraith
Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas,
Ingo Molnar
Peter Williams wrote:
> Mike Galbraith wrote:
>> On Wed, 2006-07-05 at 09:35 +1000, Peter Williams wrote:
>>
>>> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
>>> }
>>>
>>> array = rq->active;
>>> - if (unlikely(!array->nr_active)) {
>>> - /*
>>> - * Switch the active and expired arrays.
>>> - */
>>> - schedstat_inc(rq, sched_switch);
>>> - rq->active = rq->expired;
>>> - rq->expired = array;
>>> - array = rq->active;
>>> - rq->expired_timestamp = 0;
>>> - rq->best_expired_prio = MAX_PRIO;
>>> - }
>>> + if (unlikely(!array->nr_active))
>>> + array = switch_arrays(rq, MAX_PRIO);
>>>
>>> idx = sched_find_first_bit(array->bitmap);
>>> +get_next:
>>> queue = array->queue + idx;
>>> next = list_entry(queue->next, struct task_struct, run_list);
>>> + /* very strict backgrounding */
>>> + if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
>>> + int tmp = sched_find_first_bit(rq->expired->bitmap);
>>> +
>>> + if (likely(tmp < idx)) {
>>> + array = switch_arrays(rq, idx);
>>> + idx = tmp;
>>> + goto get_next;
>>
>> Won't this potentially expire the mutex holder which you specifically
>> protect in scheduler_tick() if it was preempted before being ticked?
>
> I don't think so as its prio value should cause task_in_background() to
> fail.
Actually, you're right, a pre-emption at the wrong time could cause the
prio value not to have been changed. There needs to be a
safe_to_background(next) in there somewhere.
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 13:59 ` Peter Williams
2006-07-05 14:18 ` Peter Williams
@ 2006-07-05 14:48 ` Mike Galbraith
2006-07-06 23:50 ` Peter Williams
1 sibling, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2006-07-05 14:48 UTC (permalink / raw)
To: Peter Williams
Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas,
Ingo Molnar
On Wed, 2006-07-05 at 23:59 +1000, Peter Williams wrote:
> Mike Galbraith wrote:
> > The task in the expired array could also be a !safe_to_background() task
> > who already had a chance to run, and whose slice expired.
>
> If it's !safe_to_background() it's in our interest to let it run in
> order to free up the resource that it's holding.
Only if there are waiters (or you know there will be some before the
holder gets a chance to run again). Even then, they might be background
tasks, so it could still be ~wrong.
(yeah, comprehensive PI would be mucho tidier than tick time)
-Mike
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
2006-07-05 14:48 ` Mike Galbraith
@ 2006-07-06 23:50 ` Peter Williams
0 siblings, 0 replies; 21+ messages in thread
From: Peter Williams @ 2006-07-06 23:50 UTC (permalink / raw)
To: Mike Galbraith
Cc: Andrew Morton, Nick Piggin, Linux Kernel, Con Kolivas,
Ingo Molnar
Mike Galbraith wrote:
> On Wed, 2006-07-05 at 23:59 +1000, Peter Williams wrote:
>> Mike Galbraith wrote:
>>> The task in the expired array could also be a !safe_to_background() task
>>> who already had a chance to run, and whose slice expired.
>> If it's !safe_to_background() it's in our interest to let it run in
>> order to free up the resource that it's holding.
>
> Only if there are waiters (or you know there will be some before the
> holder gets a chance to run again). Even then, they might be background
> tasks, so it could still be ~wrong.
>
> (yeah, comprehensive PI would be mucho tidier than tick time)
Yes. Unfortunately, in Ingo's opinion, even if we have comprehensive PI
it's unlikely to be reliable enough to guarantee putting tasks into the
background is safe. Of course, this wouldn't detract from its general
usefulness -- just makes it no good for SCHED_BGND/SCHED_IDLEPRIO purposes.
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2006-07-06 23:50 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-04 23:35 [PATCH] sched: Add SCHED_BGND (background) scheduling policy Peter Williams
2006-07-05 0:14 ` Con Kolivas
2006-07-05 0:49 ` Peter Williams
2006-07-05 0:52 ` Con Kolivas
2006-07-05 8:05 ` Andreas Mohr
2006-07-05 14:04 ` Jan Engelhardt
2006-07-05 0:44 ` Con Kolivas
2006-07-05 1:15 ` Peter Williams
2006-07-05 1:33 ` Con Kolivas
2006-07-05 4:20 ` Valdis.Kletnieks
2006-07-05 3:06 ` Peter Williams
2006-07-05 6:35 ` Ingo Molnar
2006-07-05 8:03 ` Peter Williams
2006-07-05 8:15 ` Arjan van de Ven
2006-07-05 8:19 ` Ingo Molnar
2006-07-05 17:40 ` Nick Piggin
2006-07-05 11:42 ` Mike Galbraith
2006-07-05 13:59 ` Peter Williams
2006-07-05 14:18 ` Peter Williams
2006-07-05 14:48 ` Mike Galbraith
2006-07-06 23:50 ` Peter Williams
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox