* [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt
[not found] <20061001112829.630288000@frodo>
@ 2006-10-01 11:36 ` Esben Nielsen
2006-11-10 14:49 ` Ingo Molnar
2006-10-01 11:36 ` [patch 2/5] " Esben Nielsen
` (3 subsequent siblings)
4 siblings, 1 reply; 10+ messages in thread
From: Esben Nielsen @ 2006-10-01 11:36 UTC (permalink / raw)
To: Ingo Molnar; +Cc: linux-kernel, Thomas Gleixner, Steven Rostedt
This patch adds a new interface to the scheduler: A task can be scheduled in
LIFO order for a while instead of the usual FIFO order. This is more or less
equivalent to raising its priority by 1/2.
This property is now needed by the rtmutexes to solve the last few issues, but
I can imagine it can be useful for other subsystems too.
include/linux/init_task.h | 1
include/linux/sched.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 29 +++++++++++++++++----
3 files changed, 87 insertions(+), 5 deletions(-)
Index: linux-2.6.18-rt/include/linux/init_task.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/init_task.h
+++ linux-2.6.18-rt/include/linux/init_task.h
@@ -91,6 +91,7 @@ extern struct group_info init_groups;
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
+ .sched_lifo = 0, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
Index: linux-2.6.18-rt/include/linux/sched.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/sched.h
+++ linux-2.6.18-rt/include/linux/sched.h
@@ -928,6 +928,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
struct list_head run_list;
struct prio_array *array;
+ int sched_lifo;
unsigned short ioprio;
unsigned int btrace_seq;
@@ -1329,6 +1330,67 @@ extern struct task_struct *curr_task(int
extern void set_curr_task(int cpu, struct task_struct *p);
extern void set_thread_priority(struct task_struct *p, int prio);
+/*
+ * sched_lifo: A task can be in sched-lifo mode and be sure to be scheduled before
+ * any other task with the same or lower priority - except for later arriving
+ * tasks with the sched_lifo property set.
+ *
+ * It is supposed to work similar to irq-flags:
+ *
+ * int old_sched_lifo;
+ * ...
+ * old_sched_lifo = enter_sched_lifo();
+ * ...
+ * leave_sched_lifo(old_sched_lifo);
+ *
+ * The purpose is that the sched-lifo sections can be easily nested.
+ *
+ * With the get/enter/leave_sched_lifo_other() the lifo status on another task
+ * can be manipulated. The status is neither atomic, nor protected by any lock.
+ * Therefore it is up to the user of those functions to ensure that the
+ * operations are properly serialized. The easiest approach is not to use the
+ * *_sched_lifo_other() functions.
+ */
+
+static inline int get_sched_lifo_other(struct task_struct *task)
+{
+ return task->sched_lifo;
+}
+
+static inline int get_sched_lifo(void)
+{
+ return get_sched_lifo_other(current);
+}
+
+static inline int enter_sched_lifo_other(struct task_struct *task)
+{
+ int old = task->sched_lifo;
+ task->sched_lifo = 1;
+ return old;
+}
+
+static inline int enter_sched_lifo(void)
+{
+ return enter_sched_lifo_other(current);
+}
+
+
+static inline void leave_sched_lifo_other(struct task_struct *task,
+ int old_value)
+{
+ task->sched_lifo = old_value;
+ /*
+ * if sched_lifo == 0 should we move to the tail of the runqueue ?
+ * what if we never slept while in sched_lifo ?
+ */
+}
+
+static inline void leave_sched_lifo(int old_value)
+{
+ leave_sched_lifo_other(current, old_value);
+}
+
+
void yield(void);
void __yield(void);
Index: linux-2.6.18-rt/kernel/sched.c
===================================================================
--- linux-2.6.18-rt.orig/kernel/sched.c
+++ linux-2.6.18-rt/kernel/sched.c
@@ -164,8 +164,8 @@
(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
+#define TASK_PREEMPTS(p,q) task_preempts(p,q)
+#define TASK_PREEMPTS_CURR(p, rq) TASK_PREEMPTS(p,(rq)->curr)
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
@@ -707,6 +707,17 @@ sched_info_switch(struct task_struct *pr
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+int task_preempts(struct task_struct *p, struct task_struct *q)
+{
+ if (p->prio < q->prio)
+ return 1;
+
+ if (p->prio > q->prio)
+ return 0;
+
+ return p->sched_lifo;
+}
+
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
static __cacheline_aligned_in_smp atomic_t rt_overload;
#endif
@@ -770,7 +781,12 @@ static void enqueue_task(struct task_str
{
WARN_ON_ONCE(p->flags & PF_DEAD);
sched_info_queued(p);
- list_add_tail(&p->run_list, array->queue + p->prio);
+
+ if (p->sched_lifo)
+ list_add(&p->run_list, array->queue + p->prio);
+ else
+ list_add_tail(&p->run_list, array->queue + p->prio);
+
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
@@ -783,7 +799,10 @@ static void enqueue_task(struct task_str
*/
static void requeue_task(struct task_struct *p, struct prio_array *array)
{
- list_move_tail(&p->run_list, array->queue + p->prio);
+ if (p->sched_lifo)
+ list_move(&p->run_list, array->queue + p->prio);
+ else
+ list_move_tail(&p->run_list, array->queue + p->prio);
}
static inline void
@@ -1463,7 +1482,7 @@ static void balance_rt_tasks(struct rq *
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- if (p && (p->prio < next->prio)) {
+ if (p && TASK_PREEMPTS(p,next)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->array);
schedstat_inc(this_rq, rto_pulled);
--
^ permalink raw reply [flat|nested] 10+ messages in thread* [patch 3/5] Fix timeout bug in rtmutex in 2.6.18-rt
[not found] <20061001112829.630288000@frodo>
2006-10-01 11:36 ` [patch 1/5] Fix timeout bug in rtmutex in 2.6.18-rt Esben Nielsen
2006-10-01 11:36 ` [patch 2/5] " Esben Nielsen
@ 2006-10-01 11:37 ` Esben Nielsen
2006-10-01 11:37 ` [patch 4/5] " Esben Nielsen
2006-10-01 11:37 ` [patch 5/5] " Esben Nielsen
4 siblings, 0 replies; 10+ messages in thread
From: Esben Nielsen @ 2006-10-01 11:37 UTC (permalink / raw)
To: Ingo Molnar; +Cc: linux-kernel, Thomas Gleixner, Steven Rostedt
This patch fixes
1) The timeout bug in rtmutexes and PI futexes: When a task is blocked on a
lock with timeout and times out it will not wakeup until the owner of the lock
is done. This is because the owner is boosted to the same priority as the
blocked task and therefore has the CPU, so the blocked task never gets around
to de-boosting it!
2) setscheduler() now does the PI walking - but defers the work to the blocked
task.
3) In general it makes sure that a task, which is boosting another has enough
priority to do the de-boosting no matter how complicated the lock structure is,
or how many times the priorities have changed.
The idea behind the patch is simple:
If a task is boosting another it is scheduled in LIFO order and it will never
lose its priority. This property lasts until it has left the lock operation
(successfully or not).
The needed priority to do the unboosting is stored in task->boosting_prio.
In the current patch this is always increasing (numerically decreasing) while
trying to take a lock. In a future it might be found safe to decrease
boosting_prio before finally leaving the lock operation.
include/linux/rtmutex.h | 1
include/linux/sched.h | 6
kernel/fork.c | 1
kernel/rtmutex.c | 142 ++++++++++++------
kernel/rtmutex_common.h | 1
kernel/sched.c | 21 ++
scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst | 36 ++--
scripts/rt-tester/t5-l4-pi-boost-deboost.tst | 12 -
8 files changed, 156 insertions(+), 64 deletions(-)
Index: linux-2.6.18-rt/include/linux/rtmutex.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/rtmutex.h
+++ linux-2.6.18-rt/include/linux/rtmutex.h
@@ -99,6 +99,7 @@ extern void rt_mutex_unlock(struct rt_mu
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \
+ .boosting_prio = MAX_PRIO, \
INIT_RT_MUTEX_DEBUG(tsk)
#else
# define INIT_RT_MUTEXES(tsk)
Index: linux-2.6.18-rt/include/linux/sched.h
===================================================================
--- linux-2.6.18-rt.orig/include/linux/sched.h
+++ linux-2.6.18-rt/include/linux/sched.h
@@ -1069,6 +1069,12 @@ struct task_struct {
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
+
+ /*
+ * Priority to which this task is boosting something. Don't lower the
+ * priority below this before the boosting is resolved;
+ */
+ int boosting_prio;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
Index: linux-2.6.18-rt/kernel/fork.c
===================================================================
--- linux-2.6.18-rt.orig/kernel/fork.c
+++ linux-2.6.18-rt/kernel/fork.c
@@ -966,6 +966,7 @@ static inline void rt_mutex_init_task(st
spin_lock_init(&p->pi_lock);
plist_head_init(&p->pi_waiters, &p->pi_lock);
p->pi_blocked_on = NULL;
+ p->boosting_prio = MAX_PRIO;
# ifdef CONFIG_DEBUG_RT_MUTEXES
p->last_kernel_lock = NULL;
# endif
Index: linux-2.6.18-rt/kernel/rtmutex.c
===================================================================
--- linux-2.6.18-rt.orig/kernel/rtmutex.c
+++ linux-2.6.18-rt/kernel/rtmutex.c
@@ -126,7 +126,7 @@ static inline void init_lists(struct rt_
* Return task->normal_prio when the waiter list is empty or when
* the waiter is not allowed to do priority boosting
*/
-int rt_mutex_getprio(struct task_struct *task)
+static int rt_mutex_getprio_real(struct task_struct *task)
{
if (likely(!task_has_pi_waiters(task)))
return task->normal_prio;
@@ -135,6 +135,11 @@ int rt_mutex_getprio(struct task_struct
task->normal_prio);
}
+int rt_mutex_getprio(struct task_struct *task)
+{
+ return min(rt_mutex_getprio_real(task), task->boosting_prio);
+}
+
/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
@@ -149,6 +154,20 @@ static void __rt_mutex_adjust_prio(struc
}
/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting.
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
* Adjust task priority (undo boosting). Called from the exit path of
* rt_mutex_slowunlock() and rt_mutex_slowlock().
*
@@ -157,11 +176,14 @@ static void __rt_mutex_adjust_prio(struc
* of task. We do not use the spin_xx_mutex() variants here as we are
* outside of the debug path.)
*/
-static void rt_mutex_adjust_prio(struct task_struct *task)
+static void rt_mutex_adjust_prio_final(struct task_struct *task,
+ int old_sched_lifo)
{
unsigned long flags;
spin_lock_irqsave(&task->pi_lock, flags);
+ task->boosting_prio = MAX_PRIO;
+ leave_sched_lifo_other(task, old_sched_lifo);
__rt_mutex_adjust_prio(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
}
@@ -237,7 +259,8 @@ static int rt_mutex_adjust_prio_chain(st
* When deadlock detection is off then we check, if further
* priority adjustment is necessary.
*/
- if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+ if (!detect_deadlock &&
+ waiter->list_entry.prio == rt_mutex_getprio_real(task))
goto out_unlock_pi;
lock = waiter->lock;
@@ -259,7 +282,9 @@ static int rt_mutex_adjust_prio_chain(st
/* Requeue the waiter */
plist_del(&waiter->list_entry, &lock->wait_list);
- waiter->list_entry.prio = task->prio;
+ waiter->list_entry.prio = rt_mutex_getprio_real(task);
+ task->boosting_prio = min(task->boosting_prio,
+ waiter->list_entry.prio);
plist_add(&waiter->list_entry, &lock->wait_list);
/* Release the task */
@@ -306,6 +331,18 @@ static int rt_mutex_adjust_prio_chain(st
}
/*
+ * Calls the rt_mutex_adjust_prio_chain() above
+ * with unlocking and locking lock->wait_lock.
+ */
+static void rt_mutex_adjust_prio_chain_unlock(struct rt_mutex *lock)
+{
+ spin_unlock(&lock->wait_lock);
+ get_task_struct(current);
+ rt_mutex_adjust_prio_chain(current, 0, NULL, NULL, NULL);
+ spin_lock(&lock->wait_lock);
+}
+
+/*
* Optimization: check if we can steal the lock from the
* assigned pending owner [which might not have taken the
* lock yet]:
@@ -315,6 +352,7 @@ static inline int try_to_steal_lock(stru
struct task_struct *pendowner = rt_mutex_owner(lock);
struct rt_mutex_waiter *next;
unsigned long flags;
+ int current_prio;
if (!rt_mutex_owner_pending(lock))
return 0;
@@ -322,8 +360,12 @@ static inline int try_to_steal_lock(stru
if (pendowner == current)
return 1;
+ spin_lock_irqsave(¤t->pi_lock, flags);
+ current_prio = rt_mutex_getprio_real(current);
+ spin_unlock_irqrestore(¤t->pi_lock, flags);
+
spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (current->prio >= pendowner->prio) {
+ if (current_prio >= rt_mutex_getprio_real(pendowner)) {
spin_unlock_irqrestore(&pendowner->pi_lock, flags);
return 0;
}
@@ -432,8 +474,8 @@ static int task_blocks_on_rt_mutex(struc
__rt_mutex_adjust_prio(current);
waiter->task = current;
waiter->lock = lock;
- plist_node_init(&waiter->list_entry, current->prio);
- plist_node_init(&waiter->pi_list_entry, current->prio);
+ plist_node_init(&waiter->list_entry, rt_mutex_getprio_real(current));
+ plist_node_init(&waiter->pi_list_entry, waiter->list_entry.prio);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -441,7 +483,9 @@ static int task_blocks_on_rt_mutex(struc
plist_add(&waiter->list_entry, &lock->wait_list);
current->pi_blocked_on = waiter;
-
+ current->boosting_prio = min(current->boosting_prio,
+ waiter->list_entry.prio);
+ enter_sched_lifo();
spin_unlock_irqrestore(¤t->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -507,7 +551,6 @@ static void wakeup_next_waiter(struct rt
plist_del(&waiter->pi_list_entry, ¤t->pi_waiters);
pendowner = waiter->task;
waiter->task = NULL;
-
rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
spin_unlock_irqrestore(¤t->pi_lock, flags);
@@ -526,6 +569,8 @@ static void wakeup_next_waiter(struct rt
WARN_ON(pendowner->pi_blocked_on->lock != lock);
pendowner->pi_blocked_on = NULL;
+ pendowner->boosting_prio = MAX_PRIO;
+ leave_sched_lifo_other(pendowner, waiter->old_sched_lifo);
if (rt_mutex_has_waiters(lock)) {
struct rt_mutex_waiter *next;
@@ -533,6 +578,7 @@ static void wakeup_next_waiter(struct rt
next = rt_mutex_top_waiter(lock);
plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
}
+ __rt_mutex_adjust_prio(pendowner);
spin_unlock_irqrestore(&pendowner->pi_lock, flags);
if (savestate)
@@ -660,6 +706,7 @@ rt_spin_lock_slowlock(struct rt_mutex *l
{
struct rt_mutex_waiter waiter;
unsigned long saved_state, state;
+ int adjust_prio_final = 0;
debug_rt_mutex_init_waiter(&waiter);
waiter.task = NULL;
@@ -686,6 +733,7 @@ rt_spin_lock_slowlock(struct rt_mutex *l
* then we return with the saved state.
*/
saved_state = xchg(¤t->state, TASK_UNINTERRUPTIBLE);
+ waiter.old_sched_lifo = get_sched_lifo();
for (;;) {
unsigned long saved_flags;
@@ -699,12 +747,16 @@ rt_spin_lock_slowlock(struct rt_mutex *l
* when we have been woken up by the previous owner
* but the lock got stolen by an higher prio task.
*/
- if (!waiter.task) {
+ if (!waiter.task)
task_blocks_on_rt_mutex(lock, &waiter, 0);
- /* Wakeup during boost ? */
- if (unlikely(!waiter.task))
- continue;
- }
+ else
+ rt_mutex_adjust_prio_chain_unlock(lock);
+
+ /*
+ * We got it while lock->wait_lock was unlocked ?
+ */
+ if (unlikely(!waiter.task))
+ continue;
/*
* Prevent schedule() to drop BKL, while waiting for
@@ -737,8 +789,10 @@ rt_spin_lock_slowlock(struct rt_mutex *l
* highest-prio waiter (due to SCHED_OTHER changing prio), then we
* can end up with a non-NULL waiter.task:
*/
- if (unlikely(waiter.task))
+ if (unlikely(waiter.task)) {
remove_waiter(lock, &waiter);
+ adjust_prio_final = 1;
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up:
@@ -747,6 +801,11 @@ rt_spin_lock_slowlock(struct rt_mutex *l
spin_unlock(&lock->wait_lock);
+ if (adjust_prio_final)
+ rt_mutex_adjust_prio_final(current, waiter.old_sched_lifo);
+
+ BUG_ON(current->boosting_prio != MAX_PRIO);
+ BUG_ON(waiter.old_sched_lifo != get_sched_lifo());
debug_rt_mutex_free_waiter(&waiter);
}
@@ -757,21 +816,16 @@ static void fastcall noinline __sched
rt_spin_lock_slowunlock(struct rt_mutex *lock)
{
spin_lock(&lock->wait_lock);
-
debug_rt_mutex_unlock(lock);
-
rt_mutex_deadlock_account_unlock(current);
-
if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
spin_unlock(&lock->wait_lock);
return;
}
-
wakeup_next_waiter(lock, 1);
-
spin_unlock(&lock->wait_lock);
-
+ BUG_ON(current->boosting_prio != MAX_PRIO);
/* Undo pi boosting.when necessary */
rt_mutex_adjust_prio(current);
}
@@ -918,6 +972,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
{
int ret = 0, saved_lock_depth = -1;
struct rt_mutex_waiter waiter;
+ int adjust_prio_final = 0;
debug_rt_mutex_init_waiter(&waiter);
waiter.task = NULL;
@@ -939,6 +994,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
saved_lock_depth = rt_release_bkl(lock);
set_current_state(state);
+ waiter.old_sched_lifo = get_sched_lifo();
/* Setup the timer, when timeout != NULL */
if (unlikely(timeout))
@@ -971,20 +1027,23 @@ rt_mutex_slowlock(struct rt_mutex *lock,
* when we have been woken up by the previous owner
* but the lock got stolen by a higher prio task.
*/
- if (!waiter.task) {
+ if (!waiter.task)
ret = task_blocks_on_rt_mutex(lock, &waiter,
detect_deadlock);
- /*
- * If we got woken up by the owner then start loop
- * all over without going into schedule to try
- * to get the lock now:
- */
- if (unlikely(!waiter.task))
- continue;
+ else
+ rt_mutex_adjust_prio_chain_unlock(lock);
+
+ /*
+ * If we got woken up by the owner then start loop
+ * all over without going into schedule to try
+ * to get the lock now:
+ */
+ if (unlikely(!waiter.task))
+ continue;
+
+ if (unlikely(ret))
+ break;
- if (unlikely(ret))
- break;
- }
saved_flags = current->flags & PF_NOSCHED;
current->flags &= ~PF_NOSCHED;
@@ -1003,8 +1062,12 @@ rt_mutex_slowlock(struct rt_mutex *lock,
set_current_state(TASK_RUNNING);
- if (unlikely(waiter.task))
+ if (unlikely(waiter.task)) {
remove_waiter(lock, &waiter);
+ adjust_prio_final = 1;
+ }
+ else if (unlikely(ret))
+ adjust_prio_final = 1;
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -1014,17 +1077,13 @@ rt_mutex_slowlock(struct rt_mutex *lock,
spin_unlock(&lock->wait_lock);
+ if (adjust_prio_final)
+ rt_mutex_adjust_prio_final(current, waiter.old_sched_lifo);
+
/* Remove pending timer: */
if (unlikely(timeout))
hrtimer_cancel(&timeout->timer);
- /*
- * Readjust priority, when we did not get the lock. We might
- * have been the pending owner and boosted. Since we did not
- * take the lock, the PI boost has to go.
- */
- if (unlikely(ret))
- rt_mutex_adjust_prio(current);
/* Must we reaquire the BKL? */
if (unlikely(saved_lock_depth >= 0))
@@ -1032,6 +1091,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
debug_rt_mutex_free_waiter(&waiter);
+ BUG_ON(current->boosting_prio != MAX_PRIO);
+
return ret;
}
@@ -1085,6 +1146,7 @@ rt_mutex_slowunlock(struct rt_mutex *loc
spin_unlock(&lock->wait_lock);
/* Undo pi boosting if necessary: */
+ BUG_ON(current->boosting_prio != MAX_PRIO);
rt_mutex_adjust_prio(current);
}
Index: linux-2.6.18-rt/kernel/rtmutex_common.h
===================================================================
--- linux-2.6.18-rt.orig/kernel/rtmutex_common.h
+++ linux-2.6.18-rt/kernel/rtmutex_common.h
@@ -49,6 +49,7 @@ struct rt_mutex_waiter {
struct plist_node pi_list_entry;
struct task_struct *task;
struct rt_mutex *lock;
+ int old_sched_lifo;
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
pid_t deadlock_task_pid;
Index: linux-2.6.18-rt/kernel/sched.c
===================================================================
--- linux-2.6.18-rt.orig/kernel/sched.c
+++ linux-2.6.18-rt/kernel/sched.c
@@ -4654,9 +4654,15 @@ void set_thread_priority(struct task_str
resched_task(rq->curr);
}
__task_rq_unlock(rq);
- spin_unlock_irqrestore(&p->pi_lock, flags);
- rt_mutex_adjust_pi(p);
+ /*
+ * If the process is blocked on rt-mutex, it will now wake up and
+ * reinsert itself into the wait list and boost the owner correctly
+ */
+ if (p->pi_blocked_on)
+ wake_up_process_mutex(p);
+
+ spin_unlock_irqrestore(&p->pi_lock, flags);
}
/**
@@ -4759,9 +4765,15 @@ recheck:
resched_task(rq->curr);
}
__task_rq_unlock(rq);
- spin_unlock_irqrestore(&p->pi_lock, flags);
- rt_mutex_adjust_pi(p);
+ /*
+ * If the process is blocked on rt-mutex, it will now wake up and
+ * reinsert itself into the wait list and boost the owner correctly
+ */
+ if (p->pi_blocked_on)
+ wake_up_process_mutex(p);
+
+ spin_unlock_irqrestore(&p->pi_lock, flags);
return 0;
}
@@ -7627,4 +7639,3 @@ void notrace preempt_enable_no_resched(v
EXPORT_SYMBOL(preempt_enable_no_resched);
#endif
-
Index: linux-2.6.18-rt/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
===================================================================
--- linux-2.6.18-rt.orig/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
+++ linux-2.6.18-rt/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
@@ -107,10 +107,10 @@ T: prioeq: 3: 84
# Reduce prio of T4
C: schedfifo: 4: 80
T: prioeq: 0: 83
-T: prioeq: 1: 83
-T: prioeq: 2: 83
-T: prioeq: 3: 83
-T: prioeq: 4: 80
+T: prioeq: 1: 84
+T: prioeq: 2: 84
+T: prioeq: 3: 84
+T: prioeq: 4: 84
# Increase prio of T4
C: schedfifo: 4: 84
@@ -139,36 +139,46 @@ T: prioeq: 4: 84
# Reduce prio of T3
C: schedfifo: 3: 83
T: prioeq: 0: 84
-T: prioeq: 1: 84
-T: prioeq: 2: 84
-T: prioeq: 3: 84
+T: prioeq: 1: 85
+T: prioeq: 2: 85
+T: prioeq: 3: 85
T: prioeq: 4: 84
# Signal T4
C: signal: 4: 0
W: unlocked: 4: 3
T: prioeq: 0: 83
-T: prioeq: 1: 83
-T: prioeq: 2: 83
-T: prioeq: 3: 83
+T: prioeq: 1: 85
+T: prioeq: 2: 85
+T: prioeq: 3: 85
+T: prioeq: 4: 84
# Signal T3
C: signal: 3: 0
W: unlocked: 3: 2
T: prioeq: 0: 82
-T: prioeq: 1: 82
-T: prioeq: 2: 82
+T: prioeq: 1: 85
+T: prioeq: 2: 85
+T: prioeq: 3: 83
+T: prioeq: 4: 84
# Signal T2
C: signal: 2: 0
W: unlocked: 2: 1
T: prioeq: 0: 81
-T: prioeq: 1: 81
+T: prioeq: 1: 85
+T: prioeq: 2: 82
+T: prioeq: 3: 83
+T: prioeq: 4: 84
# Signal T1
C: signal: 1: 0
W: unlocked: 1: 0
T: priolt: 0: 1
+T: prioeq: 1: 81
+T: prioeq: 2: 82
+T: prioeq: 3: 83
+T: prioeq: 4: 84
# Unlock and exit
C: unlock: 3: 3
Index: linux-2.6.18-rt/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
===================================================================
--- linux-2.6.18-rt.orig/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
+++ linux-2.6.18-rt/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
@@ -108,22 +108,22 @@ T: prioeq: 3: 84
C: signal: 4: 0
W: unlocked: 4: 3
T: prioeq: 0: 83
-T: prioeq: 1: 83
-T: prioeq: 2: 83
-T: prioeq: 3: 83
+T: prioeq: 1: 84
+T: prioeq: 2: 84
+T: prioeq: 3: 84
# Signal T3
C: signal: 3: 0
W: unlocked: 3: 2
T: prioeq: 0: 82
-T: prioeq: 1: 82
-T: prioeq: 2: 82
+T: prioeq: 1: 84
+T: prioeq: 2: 84
# Signal T2
C: signal: 2: 0
W: unlocked: 2: 1
T: prioeq: 0: 81
-T: prioeq: 1: 81
+T: prioeq: 1: 84
# Signal T1
C: signal: 1: 0
--
^ permalink raw reply [flat|nested] 10+ messages in thread* [patch 4/5] Fix timeout bug in rtmutex in 2.6.18-rt
[not found] <20061001112829.630288000@frodo>
` (2 preceding siblings ...)
2006-10-01 11:37 ` [patch 3/5] " Esben Nielsen
@ 2006-10-01 11:37 ` Esben Nielsen
2006-10-01 11:37 ` [patch 5/5] " Esben Nielsen
4 siblings, 0 replies; 10+ messages in thread
From: Esben Nielsen @ 2006-10-01 11:37 UTC (permalink / raw)
To: Ingo Molnar; +Cc: linux-kernel, Thomas Gleixner, Steven Rostedt
Documentation/rt-mutex-design.txt | 45 +++++++++++++++++++++++++++++++++-----
1 file changed, 40 insertions(+), 5 deletions(-)
Index: linux-2.6.18-rt/Documentation/rt-mutex-design.txt
===================================================================
--- linux-2.6.18-rt.orig/Documentation/rt-mutex-design.txt
+++ linux-2.6.18-rt/Documentation/rt-mutex-design.txt
@@ -229,6 +229,23 @@ pi_list. This list is protected by a sp
called pi_lock. This lock may also be taken in interrupt context, so when
locking the pi_lock, interrupts must be disabled.
+Boosting prio sched lifo
+------------------------
+A task's boosting prio is used to avoid lowering a task's priority below the
+priority to which it is boosting another task. A task is never given a priority
+less than boosting prio. This is needed because in the event of a signal or a
+timeout the task has to perform the deboosting of the owner. But if it
+already has lower priority it might not get the needed CPU to do that.
+boosting prio is reset when the task gets the lock or it is signalled/timed out
+and has done its deboosting. For that reason the extra boosting is only
+given to tasks running inside the down operations. But while inside a down
+operation boosting prio is never lowered. That is the task priority is never
+decreased.
+While boosting prio is active the task is made "sched lifo". I.e. it schedules
+in lifo order among equal priority tasks instead of the usual fifo order. This
+is again done to make sure that a task being signalled while sleeping inside
+the down operation will get the CPU before the task being boosted, so that it can
+perform the deboost operation.
Depth of the PI Chain
---------------------
@@ -370,11 +387,21 @@ rt_mutex_getprio and rt_mutex_setprio ar
rt_mutex_getprio returns the priority that the task should have. Either the
task's own normal priority, or if a process of a higher priority is waiting on
-a mutex owned by the task, then that higher priority should be returned.
+a mutex owned by the task, then that higher priority should be returned. If
+the task is currently boosting another task to some priority, rt_mutex_getprio
+will return at least that priority. This is because deboosting in the event of
+a signal or a timeout must be performed by this task. But if this task has
+lower priority than the task being boosted it might not get a CPU to run on -
+especially on UP machines.
+rt_mutex_getprio_real returns the task's priority without taking boosting_prio
+into account. This is used as the priority for the waiter list and thus
+also for the PI list. If rt_mutex_getprio was used the order in the wait_list
+would be incorrect and the task would boost too much.
+
Since the pi_list of a task holds an order by priority list of all the top
-waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
-to compare the top pi waiter to its own normal priority, and return the higher
-priority back.
+waiters of all the mutexes that the task owns, rt_mutex_getprio_real simply
+needs to compare the top pi waiter to its own normal priority, and return the
+higher priority back.
(Note: if looking at the code, you will notice that the lower number of
prio is returned. This is because the prio field in the task structure
@@ -680,6 +707,10 @@ If the owner is also blocked on a lock,
(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+In the second round of the loop we might fail getting the lock but
+waiter.task might not be NULL. In that case we also release the wait_lock and
+call rt_mutex_adjust_prio_chain in case some priority was changed.
+
Now all locks are released, and if the current process is still blocked on a
mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
@@ -690,6 +721,7 @@ The schedule can then wake up for a few
1) we were given pending ownership of the mutex.
2) we received a signal and was TASK_INTERRUPTIBLE
3) we had a timeout and was TASK_INTERRUPTIBLE
+ 4) our priority was changed and we have to perform priority adjustments
In any of these cases, we continue the loop and once again try to grab the
ownership of the mutex. If we succeed, we exit the loop, otherwise we continue
@@ -711,6 +743,8 @@ pi_list of the owner. If this process w
the rt_mutex_adjust_prio_chain needs to be executed again on the owner,
but this time it will be lowering the priorities.
+At the end we need to reset boosting prio and adjust our own priority.
+
Unlocking the Mutex
-------------------
@@ -759,8 +793,10 @@ We now clear the "pi_blocked_on" field o
the mutex still has waiters pending, we add the new top waiter to the pi_list
of the pending owner.
-Finally we unlock the pi_lock of the pending owner and wake it up.
+We also reset the boosting prio of the pending owner and remove the sched lifo
+property: Now it is the owner and it can't be boosting anyone.
+Finally we unlock the pi_lock of the pending owner and wake it up.
Contact
-------
--
^ permalink raw reply [flat|nested] 10+ messages in thread