* [patch 05/10] PI-futex: rt-mutex core
@ 2006-03-25 18:46 Ingo Molnar
2006-03-26 5:35 ` Andrew Morton
2006-03-26 6:07 ` Andrew Morton
0 siblings, 2 replies; 12+ messages in thread
From: Ingo Molnar @ 2006-03-25 18:46 UTC (permalink / raw)
To: linux-kernel
Cc: Thomas Gleixner, Linus Torvalds, Andrew Morton, Arjan van de Ven
From: Ingo Molnar <mingo@elte.hu>
core functions for the rt-mutex subsystem.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
----
include/linux/init_task.h | 1
include/linux/rtmutex.h | 106 ++++
include/linux/rtmutex_internal.h | 157 ++++++
include/linux/sched.h | 11
init/Kconfig | 5
kernel/Makefile | 1
kernel/fork.c | 3
kernel/rtmutex.c | 941 +++++++++++++++++++++++++++++++++++++++
kernel/rtmutex.h | 28 +
9 files changed, 1253 insertions(+)
Index: linux-pi-futex.mm.q/include/linux/init_task.h
===================================================================
--- linux-pi-futex.mm.q.orig/include/linux/init_task.h
+++ linux-pi-futex.mm.q/include/linux/init_task.h
@@ -123,6 +123,7 @@ extern struct group_info init_groups;
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
+ INIT_RT_MUTEXES(tsk) \
}
Index: linux-pi-futex.mm.q/include/linux/rtmutex.h
===================================================================
--- /dev/null
+++ linux-pi-futex.mm.q/include/linux/rtmutex.h
@@ -0,0 +1,106 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef __LINUX_RT_MUTEX_H
+#define __LINUX_RT_MUTEX_H
+
+#include <linux/linkage.h>
+#include <linux/plist.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * The rt_mutex structure
+ *
+ * @wait_lock: spinlock to protect the structure
+ * @wait_list: pilist head to enqueue waiters in priority order
+ * @owner: the mutex owner
+ */
+struct rt_mutex {
+ spinlock_t wait_lock;
+ struct plist_head wait_list;
+ struct task_struct *owner;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+ int save_state;
+ struct list_head held_list;
+ unsigned long acquire_ip;
+ const char *name, *file;
+ int line;
+ void *magic;
+# endif
+};
+
+struct rt_mutex_waiter;
+struct hrtimer_sleeper;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+ , .name = #mutexname, .file = __FILE__, .line = __LINE__
+# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __FUNCTION__)
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
+#else
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
+# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
+# define rt_mutex_debug_task_free(t) do { } while (0)
+#endif
+
+#define __RT_MUTEX_INITIALIZER(mutexname) \
+ { .wait_lock = SPIN_LOCK_UNLOCKED \
+ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
+ , .owner = NULL \
+ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+
+#define DEFINE_RT_MUTEX(mutexname) \
+ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
+
+/***
+ * rt_mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int fastcall rt_mutex_is_locked(struct rt_mutex *lock)
+{
+ return lock->owner != NULL;
+}
+
+extern void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void fastcall rt_mutex_destroy(struct rt_mutex *lock);
+
+extern void fastcall rt_mutex_lock(struct rt_mutex *lock);
+extern int fastcall rt_mutex_lock_interruptible(struct rt_mutex *lock,
+ int detect_deadlock);
+extern int fastcall rt_mutex_timed_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock);
+
+extern int fastcall rt_mutex_trylock(struct rt_mutex *lock);
+
+extern void fastcall rt_mutex_unlock(struct rt_mutex *lock);
+
+#ifdef CONFIG_RT_MUTEXES
+# define rt_mutex_init_task(p) \
+ do { \
+ spin_lock_init(&p->pi_lock); \
+ plist_head_init(&p->pi_waiters); \
+ p->pi_blocked_on = NULL; \
+ p->pi_locked_by = NULL; \
+ INIT_LIST_HEAD(&p->pi_lock_chain); \
+ } while (0)
+# define INIT_RT_MUTEXES(tsk) \
+ .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \
+ .pi_lock = SPIN_LOCK_UNLOCKED, \
+ .pi_lock_chain = LIST_HEAD_INIT(tsk.pi_lock_chain),
+#else
+# define rt_mutex_init_task(p) do { } while (0)
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
+#endif
Index: linux-pi-futex.mm.q/include/linux/rtmutex_internal.h
===================================================================
--- /dev/null
+++ linux-pi-futex.mm.q/include/linux/rtmutex_internal.h
@@ -0,0 +1,157 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __LINUX_RT_MUTEX_INTERNAL_H
+#define __LINUX_RT_MUTEX_INTERNAL_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+ * @list_entry: pi node to enqueue into the mutex waiters list
+ * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct plist_node list_entry;
+ struct plist_node pi_list_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ pid_t deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+};
+
+/*
+ * Plist wrapper macros
+ */
+#define rt_mutex_has_waiters(lock) (!plist_head_empty(&lock->wait_list))
+
+#define rt_mutex_top_waiter(lock) \
+({ struct rt_mutex_waiter *__w = plist_first_entry(&lock->wait_list, \
+ struct rt_mutex_waiter, list_entry); \
+ BUG_ON(__w->lock != lock); \
+ __w; \
+})
+
+#define task_has_pi_waiters(task) (!plist_head_empty(&task->pi_waiters))
+
+#define task_top_pi_waiter(task) \
+ plist_first_entry(&task->pi_waiters, struct rt_mutex_waiter, pi_list_entry)
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner bit1 bit0
+ * NULL 0 0 lock is free (fast acquire possible)
+ * NULL 0 1 invalid state
+ * NULL 1 0 invalid state
+ * NULL 1 1 invalid state
+ * taskpointer 0 0 lock is held (fast release possible)
+ * taskpointer 0 1 task is pending owner
+ * taskpointer 1 0 lock is held and has waiters
+ * taskpointer 1 1 task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ */
+#define RT_MUTEX_OWNER_PENDING 1UL
+#define RT_MUTEX_HAS_WAITERS 2UL
+#define RT_MUTEX_OWNER_MASKALL 3UL
+
+#define rt_mutex_owner(lock) \
+({ \
+ typecheck(struct rt_mutex *,(lock)); \
+ ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL)); \
+})
+
+#define rt_mutex_real_owner(lock) \
+({ \
+ typecheck(struct rt_mutex *,(lock)); \
+ ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS)); \
+})
+
+#define rt_mutex_owner_pending(lock) \
+({ \
+ typecheck(struct rt_mutex *,(lock)); \
+ ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); \
+})
+
+static inline void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long msk)
+{
+ unsigned long val = ((unsigned long) owner) | msk;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)(val);
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner;
+
+ owner = ((unsigned long) lock->owner) & ~RT_MUTEX_HAS_WAITERS;
+ lock->owner = (struct task_struct *)(owner);
+}
+
+static inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+
+#else
+
+# define rt_mutex_cmpxchg(l,c,n) (0)
+
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner = ((unsigned long) lock->owner)| RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *) owner;
+}
+
+#endif
+
+#endif
Index: linux-pi-futex.mm.q/include/linux/sched.h
===================================================================
--- linux-pi-futex.mm.q.orig/include/linux/sched.h
+++ linux-pi-futex.mm.q/include/linux/sched.h
@@ -36,6 +36,7 @@
#include <linux/seccomp.h>
#include <linux/rcupdate.h>
#include <linux/futex.h>
+#include <linux/rtmutex.h>
#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
@@ -858,6 +859,16 @@ struct task_struct {
/* Protection of the PI data structures: */
spinlock_t pi_lock;
+#ifdef CONFIG_RT_MUTEXES
+ /* PI waiters blocked on a rt_mutex held by this task */
+ struct plist_head pi_waiters;
+ /* Deadlock detection and priority inheritance handling */
+ struct rt_mutex_waiter *pi_blocked_on;
+ /* PI locking helpers */
+ struct task_struct *pi_locked_by;
+ struct list_head pi_lock_chain;
+#endif
+
#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
Index: linux-pi-futex.mm.q/init/Kconfig
===================================================================
--- linux-pi-futex.mm.q.orig/init/Kconfig
+++ linux-pi-futex.mm.q/init/Kconfig
@@ -361,9 +361,14 @@ config BASE_FULL
kernel data structures. This saves memory on small machines,
but may reduce performance.
+config RT_MUTEXES
+ boolean
+ select PLIST
+
config FUTEX
bool "Enable futex support" if EMBEDDED
default y
+ select RT_MUTEXES
help
Disabling this option will cause the kernel to be built without
support for "fast userspace mutexes". The resulting kernel may not
Index: linux-pi-futex.mm.q/kernel/Makefile
===================================================================
--- linux-pi-futex.mm.q.orig/kernel/Makefile
+++ linux-pi-futex.mm.q/kernel/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
Index: linux-pi-futex.mm.q/kernel/fork.c
===================================================================
--- linux-pi-futex.mm.q.orig/kernel/fork.c
+++ linux-pi-futex.mm.q/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
void free_task(struct task_struct *tsk)
{
free_thread_info(tsk->thread_info);
+ rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -1042,6 +1043,8 @@ static task_t *copy_process(unsigned lon
mpol_fix_fork_child_flag(p);
#endif
+ rt_mutex_init_task(p);
+
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
Index: linux-pi-futex.mm.q/kernel/rtmutex.c
===================================================================
--- /dev/null
+++ linux-pi-futex.mm.q/kernel/rtmutex.c
@@ -0,0 +1,941 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include <linux/rtmutex_internal.h>
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return task->normal_prio;
+
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+ task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+ int prio = rt_mutex_getprio(task);
+
+ if (task->prio != prio)
+ rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * PI-locking: we lock PI-dependencies opportunistically via trylock.
+ *
+ * In the overwhelming majority of cases the 'PI chain' is empty or at
+ * most 1-2 entries long, for which the trylock is sufficient,
+ * scalability-wise. The locking might look a bit scary, for which we
+ * apologize in advance :-)
+ *
+ * If any of the trylocks fails then we back out, task the global
+ * pi_conflicts_lock and take the locks again. This ensures deadlock-free
+ * but still scalable locking in the dependency graph, combined with
+ * the ability to reliably (and cheaply) detect user-space deadlocks.
+ */
+static DEFINE_SPINLOCK(pi_conflicts_lock);
+
+/*
+ * Lock the full boosting chain.
+ *
+ * If 'try' is set, we have to backout if we hit a owner who is
+ * running its own pi chain operation. We go back and take the slow
+ * path via the pi_conflicts_lock.
+ */
+static int lock_pi_chain(struct rt_mutex *act_lock,
+ struct rt_mutex_waiter *waiter,
+ struct list_head *lock_chain,
+ int try, int detect_deadlock)
+{
+ struct task_struct *owner;
+ struct rt_mutex *nextlock, *lock = act_lock;
+ struct rt_mutex_waiter *nextwaiter;
+
+ /*
+ * Debugging might turn deadlock detection on, unconditionally:
+ */
+ detect_deadlock = debug_rt_mutex_detect_deadlock(detect_deadlock);
+
+ for (;;) {
+ owner = rt_mutex_owner(lock);
+
+ /* Check for circular dependencies */
+ if (unlikely(owner->pi_locked_by == current)) {
+ debug_rt_mutex_deadlock(detect_deadlock, waiter, lock);
+ return detect_deadlock ? -EDEADLK : 0;
+ }
+
+ while (!spin_trylock(&owner->pi_lock)) {
+ /*
+ * Owner runs its own chain. Go back and take
+ * the slow path
+ */
+ if (try && owner->pi_locked_by == owner)
+ return -EBUSY;
+ cpu_relax();
+ }
+
+ BUG_ON(owner->pi_locked_by);
+ owner->pi_locked_by = current;
+ BUG_ON(!list_empty(&owner->pi_lock_chain));
+ list_add(&owner->pi_lock_chain, lock_chain);
+
+ /*
+ * When the owner is blocked on a lock, try to take
+ * the lock:
+ */
+ nextwaiter = owner->pi_blocked_on;
+
+ /* End of chain? */
+ if (!nextwaiter)
+ return 0;
+
+ nextlock = nextwaiter->lock;
+
+ /* Check for circular dependencies: */
+ if (unlikely(nextlock == act_lock ||
+ rt_mutex_owner(nextlock) == current)) {
+ debug_rt_mutex_deadlock(detect_deadlock, waiter,
+ nextlock);
+ list_del_init(&owner->pi_lock_chain);
+ owner->pi_locked_by = NULL;
+ spin_unlock(&owner->pi_lock);
+ return detect_deadlock ? -EDEADLK : 0;
+ }
+
+ /* Try to get nextlock->wait_lock: */
+ if (unlikely(!spin_trylock(&nextlock->wait_lock))) {
+ list_del_init(&owner->pi_lock_chain);
+ owner->pi_locked_by = NULL;
+ spin_unlock(&owner->pi_lock);
+ cpu_relax();
+ continue;
+ }
+
+ lock = nextlock;
+
+ /*
+ * If deadlock detection is done (or has to be done, as
+ * for userspace locks), we have to walk the full chain
+ * unconditionally.
+ */
+ if (detect_deadlock)
+ continue;
+
+ /*
+ * Optimization: we only have to continue up to the point
+ * where boosting/unboosting still has to be done:
+ */
+
+ /* Boost or unboost? */
+ if (waiter) {
+ /* If the top waiter has higher priority, stop: */
+ if (rt_mutex_top_waiter(lock)->list_entry.prio <=
+ waiter->list_entry.prio)
+ return 0;
+ } else {
+ /* If nextwaiter is not the top waiter, stop: */
+ if (rt_mutex_top_waiter(lock) != nextwaiter)
+ return 0;
+ }
+ }
+}
+
+/*
+ * Unlock the pi_chain:
+ */
+static void unlock_pi_chain(struct list_head *lock_chain)
+{
+ struct task_struct *owner, *tmp;
+
+ list_for_each_entry_safe(owner, tmp, lock_chain, pi_lock_chain) {
+ struct rt_mutex_waiter *waiter = owner->pi_blocked_on;
+
+ list_del_init(&owner->pi_lock_chain);
+ BUG_ON(!owner->pi_locked_by);
+ owner->pi_locked_by = NULL;
+ if (waiter)
+ spin_unlock(&waiter->lock->wait_lock);
+ spin_unlock(&owner->pi_lock);
+ }
+}
+
+/*
+ * Do the priority (un)boosting along the chain:
+ */
+static void adjust_pi_chain(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter,
+ struct list_head *lock_chain)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct list_head *curr = lock_chain->prev;
+
+ for (;;) {
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry,
+ &owner->pi_waiters);
+
+ if (waiter && waiter == rt_mutex_top_waiter(lock)) {
+ waiter->pi_list_entry.prio = waiter->task->prio;
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ }
+ __rt_mutex_adjust_prio(owner);
+
+ waiter = owner->pi_blocked_on;
+ if (!waiter || curr->prev == lock_chain)
+ return;
+
+ curr = curr->prev;
+ lock = waiter->lock;
+ owner = rt_mutex_owner(lock);
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->list_entry.prio = waiter->task->prio;
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ /*
+ * We can stop here, if the waiter is/was not the top
+ * priority waiter:
+ */
+ if (top_waiter != waiter &&
+ waiter != rt_mutex_top_waiter(lock))
+ return;
+
+ /*
+ * Note: waiter is not necessarily the new top
+ * waiter!
+ */
+ waiter = rt_mutex_top_waiter(lock);
+ }
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and potentially propagate our priority into the pi chain.
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock __IP_DECL__)
+{
+ int res = 0;
+ struct rt_mutex_waiter *top_waiter = waiter;
+ LIST_HEAD(lock_chain);
+
+ waiter->task = current;
+ waiter->lock = lock;
+ debug_rt_mutex_reset_waiter(waiter);
+
+ spin_lock(¤t->pi_lock);
+ current->pi_locked_by = current;
+ plist_node_init(&waiter->list_entry, current->prio);
+ plist_node_init(&waiter->pi_list_entry, current->prio);
+
+ /* Get the top priority waiter of the lock: */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ current->pi_blocked_on = waiter;
+
+ /*
+ * Call adjust_prio_chain, when waiter is the new top waiter
+ * or when deadlock detection is requested:
+ */
+ if (waiter != rt_mutex_top_waiter(lock) &&
+ !debug_rt_mutex_detect_deadlock(detect_deadlock))
+ goto out;
+
+ /* Try to lock the full chain: */
+ res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock);
+
+ if (likely(!res))
+ adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
+
+ /* Common case: we managed to lock it: */
+ if (res != -EBUSY)
+ goto out_unlock;
+
+ /* Rare case: we hit some other task running a pi chain operation: */
+ unlock_pi_chain(&lock_chain);
+
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ current->pi_blocked_on = NULL;
+ current->pi_locked_by = NULL;
+ spin_unlock(¤t->pi_lock);
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ spin_lock(&pi_conflicts_lock);
+
+ spin_lock(¤t->pi_lock);
+ current->pi_locked_by = current;
+ spin_lock(&lock->wait_lock);
+ if (!rt_mutex_owner(lock)) {
+ waiter->task = NULL;
+ spin_unlock(&pi_conflicts_lock);
+ goto out;
+ }
+ plist_node_init(&waiter->list_entry, current->prio);
+ plist_node_init(&waiter->pi_list_entry, current->prio);
+
+ /* Get the top priority waiter of the lock: */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ current->pi_blocked_on = waiter;
+
+ /* Lock the full chain: */
+ res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock);
+
+ /* Drop the conflicts lock before adjusting: */
+ spin_unlock(&pi_conflicts_lock);
+
+ if (likely(!res))
+ adjust_pi_chain(lock, waiter, top_waiter, &lock_chain);
+
+ out_unlock:
+ unlock_pi_chain(&lock_chain);
+ out:
+ current->pi_locked_by = NULL;
+ spin_unlock(¤t->pi_lock);
+ return res;
+}
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+ struct task_struct *pendowner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *next;
+
+ if (!rt_mutex_owner_pending(lock))
+ return 0;
+
+ if (pendowner == current)
+ return 1;
+
+ spin_lock(&pendowner->pi_lock);
+ if (current->prio >= pendowner->prio) {
+ spin_unlock(&pendowner->pi_lock);
+ return 0;
+ }
+
+ /*
+ * Check if a waiter is enqueued on the pending owners
+ * pi_waiters list. Remove it and readjust pending owners
+ * priority.
+ */
+ if (likely(!rt_mutex_has_waiters(lock))) {
+ spin_unlock(&pendowner->pi_lock);
+ return 1;
+ }
+
+ /* No chain handling, pending owner is not blocked on anything: */
+ next = rt_mutex_top_waiter(lock);
+ plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+ __rt_mutex_adjust_prio(pendowner);
+ spin_unlock(&pendowner->pi_lock);
+
+ /*
+ * We are going to steal the lock and a waiter was
+ * enqueued on the pending owners pi_waiters queue. So
+ * we have to enqueue this waiter into
+ * current->pi_waiters list. This covers the case,
+ * where current is boosted because it holds another
+ * lock and gets unboosted because the booster is
+ * interrupted, so we would delay a waiter with higher
+ * priority as current->normal_prio.
+ *
+ * Note: in the rare case of a SCHED_OTHER task changing
+ * its priority and thus stealing the lock, next->task
+ * might be current:
+ */
+ if (likely(next->task != current)) {
+ spin_lock(¤t->pi_lock);
+ plist_add(&next->pi_list_entry, ¤t->pi_waiters);
+ __rt_mutex_adjust_prio(current);
+ spin_unlock(¤t->pi_lock);
+ }
+ return 1;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+{
+ /*
+ * We have to be careful here if the atomic speedups are
+ * enabled, such that, when
+ * - no other waiter is on the lock
+ * - the lock has been released since we did the cmpxchg
+ * the lock can be released or taken while we are doing the
+ * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ *
+ * The atomic acquire/release aware variant of
+ * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+ * the WAITERS bit, the atomic release / acquire can not
+ * happen anymore and lock->wait_lock protects us from the
+ * non-atomic case.
+ *
+ * Note, that this might set lock->owner =
+ * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+ * any more. This is fixed up when we take the ownership.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ return 0;
+
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock __IP__);
+
+ rt_mutex_set_owner(lock, current, 0);
+
+ rt_mutex_deadlock_account_lock(lock, current);
+
+ return 1;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current tasks waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+ struct task_struct *pendowner;
+
+ waiter = rt_mutex_top_waiter(lock);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ /*
+ * Remove it from current->pi_waiters. We do not adjust a
+ * possible priority boost right now. We execute wakeup in the
+ * boosted mode and go back to normal after releasing
+ * lock->wait_lock.
+ */
+ spin_lock(¤t->pi_lock);
+ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters);
+ spin_unlock(¤t->pi_lock);
+
+ pendowner = waiter->task;
+ waiter->task = NULL;
+
+ rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+ /*
+ * Clear the pi_blocked_on variable and enqueue a possible
+ * waiter into the pi_waiters list of the pending owner. This
+ * prevents that in case the pending owner gets unboosted a
+ * waiter with higher priority than pending-owner->normal_prio
+ * is blocked on the unboosted (pending) owner.
+ */
+ spin_lock(&pendowner->pi_lock);
+
+ WARN_ON(!pendowner->pi_blocked_on);
+ WARN_ON(pendowner->pi_blocked_on != waiter);
+ WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+ pendowner->pi_blocked_on = NULL;
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+ }
+ spin_unlock(&pendowner->pi_lock);
+
+ wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter __IP_DECL__)
+{
+ struct rt_mutex_waiter *next_waiter = NULL,
+ *top_waiter = rt_mutex_top_waiter(lock);
+ LIST_HEAD(lock_chain);
+ int res;
+
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ spin_lock(¤t->pi_lock);
+
+ if (waiter != top_waiter || rt_mutex_owner(lock) == current)
+ goto out;
+
+ current->pi_locked_by = current;
+
+ if (rt_mutex_has_waiters(lock))
+ next_waiter = rt_mutex_top_waiter(lock);
+
+ /* Try to lock the full chain: */
+ res = lock_pi_chain(lock, next_waiter, &lock_chain, 1, 0);
+
+ if (likely(res != -EBUSY)) {
+ adjust_pi_chain(lock, next_waiter, waiter, &lock_chain);
+ goto out_unlock;
+ }
+
+ /* We hit some other task running a pi chain operation: */
+ unlock_pi_chain(&lock_chain);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+ current->pi_blocked_on = waiter;
+ current->pi_locked_by = NULL;
+ spin_unlock(¤t->pi_lock);
+ spin_unlock(&lock->wait_lock);
+
+ spin_lock(&pi_conflicts_lock);
+
+ spin_lock(¤t->pi_lock);
+ current->pi_locked_by = current;
+
+ spin_lock(&lock->wait_lock);
+
+ /* We might have been woken up: */
+ if (!waiter->task) {
+ spin_unlock(&pi_conflicts_lock);
+ goto out;
+ }
+
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ if (waiter != top_waiter || rt_mutex_owner(lock) == current)
+ goto out;
+
+ /* Get the top priority waiter of the lock: */
+ if (rt_mutex_has_waiters(lock))
+ next_waiter = rt_mutex_top_waiter(lock);
+
+ /* Lock the full chain: */
+ lock_pi_chain(lock, next_waiter, &lock_chain, 0, 0);
+
+ /* Drop the conflicts lock: */
+ spin_unlock(&pi_conflicts_lock);
+
+ adjust_pi_chain(lock, next_waiter, waiter, &lock_chain);
+
+ out_unlock:
+ unlock_pi_chain(&lock_chain);
+ out:
+ current->pi_blocked_on = NULL;
+ waiter->task = NULL;
+ current->pi_locked_by = NULL;
+ spin_unlock(¤t->pi_lock);
+
+ WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+ return 0;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int fastcall noinline __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+ unsigned long flags;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock __IP__)) {
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return 0;
+ }
+
+ BUG_ON(rt_mutex_owner(lock) == current);
+
+ set_task_state(current, state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout))
+ hrtimer_start(&timeout->timer, timeout->timer.expires,
+ HRTIMER_ABS);
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock __IP__))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ /*
+ * waiter.task is NULL the first time we come here and
+ * when we have been woken up by the previous owner
+ * but the lock got stolen by an higher prio task.
+ */
+ if (!waiter.task) {
+ ret = task_blocks_on_rt_mutex(lock, &waiter,
+ detect_deadlock __IP__);
+ if (ret == -EDEADLK)
+ break;
+ if (ret == -EBUSY)
+ continue;
+ }
+
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ schedule();
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+ set_task_state(current, state);
+ }
+
+ set_task_state(current, TASK_RUNNING);
+
+ if (unlikely(waiter.task))
+ remove_waiter(lock, &waiter __IP__);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout && timeout->task))
+ hrtimer_cancel(&timeout->timer);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might
+ * have been the pending owner and boosted. Since we did not
+ * take the lock, the PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int fastcall
+rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+
+ if (likely(rt_mutex_owner(lock) != current)) {
+
+ ret = try_to_take_rt_mutex(lock __IP__);
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters
+ * bit. We might be the only waiter. Check if this
+ * needs to be cleaned up.
+ */
+ if (!ret)
+ fixup_rt_mutex_waiters(lock);
+ }
+
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void fastcall noinline __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return;
+ }
+
+ wakeup_next_waiter(lock);
+
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int detect_deadlock,
+ int fastcall (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout, int detect_deadlock,
+ int fastcall (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int fastcall (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 1;
+ }
+ return slowfn(lock __RET_IP__);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ void fastcall (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void fastcall __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int fastcall __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEOUT when the timeout expired
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int fastcall
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int fastcall __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void fastcall __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void fastcall rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ lock->owner = NULL;
+ spin_lock_init(&lock->wait_lock);
+ plist_head_init(&lock->wait_list);
+
+ debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
Index: linux-pi-futex.mm.q/kernel/rtmutex.h
===================================================================
--- /dev/null
+++ linux-pi-futex.mm.q/kernel/rtmutex.h
@@ -0,0 +1,28 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+#define rt_mutex_deadlock_check(l) (0)
+#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_detect_deadlock(d) (d)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [patch 05/10] PI-futex: rt-mutex core 2006-03-25 18:46 [patch 05/10] PI-futex: rt-mutex core Ingo Molnar @ 2006-03-26 5:35 ` Andrew Morton 2006-03-26 23:33 ` Ingo Molnar 2006-03-26 6:07 ` Andrew Morton 1 sibling, 1 reply; 12+ messages in thread From: Andrew Morton @ 2006-03-26 5:35 UTC (permalink / raw) To: Ingo Molnar; +Cc: linux-kernel, tglx, torvalds, arjan Ingo Molnar <mingo@elte.hu> wrote: > > --- linux-pi-futex.mm.q.orig/init/Kconfig > +++ linux-pi-futex.mm.q/init/Kconfig > @@ -361,9 +361,14 @@ config BASE_FULL > kernel data structures. This saves memory on small machines, > but may reduce performance. > > +config RT_MUTEXES > + boolean > + select PLIST > + > config FUTEX > bool "Enable futex support" if EMBEDDED > default y > + select RT_MUTEXES > help We can't turn these things off unless we also turn off futexes. There goes another 8.7 kbytes... ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch 05/10] PI-futex: rt-mutex core 2006-03-26 5:35 ` Andrew Morton @ 2006-03-26 23:33 ` Ingo Molnar 0 siblings, 0 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-26 23:33 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, arjan * Andrew Morton <akpm@osdl.org> wrote: > Ingo Molnar <mingo@elte.hu> wrote: > > > > --- linux-pi-futex.mm.q.orig/init/Kconfig > > +++ linux-pi-futex.mm.q/init/Kconfig > > @@ -361,9 +361,14 @@ config BASE_FULL > > kernel data structures. This saves memory on small machines, > > but may reduce performance. > > > > +config RT_MUTEXES > > + boolean > > + select PLIST > > + > > config FUTEX > > bool "Enable futex support" if EMBEDDED > > default y > > + select RT_MUTEXES > > help > > We can't turn these things off unless we also turn off futexes. There > goes another 8.7 kbytes... do we want to make the futex API so splintered via .config? 8.7 kbytes on SMP boxes, a bit less on UP boxes. RAM-starving embedded platforms will want to turn off all things futexes i guess. Ingo ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch 05/10] PI-futex: rt-mutex core 2006-03-25 18:46 [patch 05/10] PI-futex: rt-mutex core Ingo Molnar 2006-03-26 5:35 ` Andrew Morton @ 2006-03-26 6:07 ` Andrew Morton 2006-03-26 16:03 ` [patch] PI-futex: -V2 Ingo Molnar 1 sibling, 1 reply; 12+ messages in thread From: Andrew Morton @ 2006-03-26 6:07 UTC (permalink / raw) To: Ingo Molnar; +Cc: linux-kernel, tglx, torvalds, arjan Ingo Molnar <mingo@elte.hu> wrote: > > From: Ingo Molnar <mingo@elte.hu> > > +/* > + * The rt_mutex structure > + * > + * @wait_lock: spinlock to protect the structure > + * @wait_list: pilist head to enqueue waiters in priority order > + * @owner: the mutex owner > + */ > +struct rt_mutex { > + spinlock_t wait_lock; > + struct plist_head wait_list; > + struct task_struct *owner; > +# ifdef CONFIG_DEBUG_RT_MUTEXES > + int save_state; > + struct list_head held_list; > + unsigned long acquire_ip; > + const char *name, *file; > + int line; > + void *magic; > +# endif > +}; The indented #-statments make some sense when we're using nested #ifs (although I tend to accidentally-on-purpose delete them). But the above ones aren't even nested.. > +extern void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name); > +extern void fastcall rt_mutex_destroy(struct rt_mutex *lock); > + > +extern void fastcall rt_mutex_lock(struct rt_mutex *lock); > +extern int fastcall rt_mutex_lock_interruptible(struct rt_mutex *lock, > + int detect_deadlock); > +extern int fastcall rt_mutex_timed_lock(struct rt_mutex *lock, > + struct hrtimer_sleeper *timeout, > + int detect_deadlock); > + > +extern int fastcall rt_mutex_trylock(struct rt_mutex *lock); > + > +extern void fastcall rt_mutex_unlock(struct rt_mutex *lock); Does fastcall actually do any good? Isn't CONFIG_REGPARM equivalent to that anyway? It's a bit of an eyesore. > +#ifdef CONFIG_RT_MUTEXES > +# define rt_mutex_init_task(p) \ > + do { \ > + spin_lock_init(&p->pi_lock); \ > + plist_head_init(&p->pi_waiters); \ > + p->pi_blocked_on = NULL; \ > + p->pi_locked_by = NULL; \ > + INIT_LIST_HEAD(&p->pi_lock_chain); \ > + } while (0) Somewhere in there is a C function struggling to escape. > Index: linux-pi-futex.mm.q/include/linux/rtmutex_internal.h Perhaps this could go in kernel/. If you think that's valuable. > +/* > + * Plist wrapper macros > + */ > +#define rt_mutex_has_waiters(lock) (!plist_head_empty(&lock->wait_list)) > + > +#define rt_mutex_top_waiter(lock) \ > +({ struct rt_mutex_waiter *__w = plist_first_entry(&lock->wait_list, \ > + struct rt_mutex_waiter, list_entry); \ > + BUG_ON(__w->lock != lock); \ > + __w; \ > +}) > + > +#define task_has_pi_waiters(task) (!plist_head_empty(&task->pi_waiters)) > + > +#define task_top_pi_waiter(task) \ > + plist_first_entry(&task->pi_waiters, struct rt_mutex_waiter, pi_list_entry) All of these can become C functions, yes? > +#define rt_mutex_owner(lock) \ > +({ \ > + typecheck(struct rt_mutex *,(lock)); \ > + ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL)); \ > +}) > + > +#define rt_mutex_real_owner(lock) \ > +({ \ > + typecheck(struct rt_mutex *,(lock)); \ > + ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS)); \ > +}) > + > +#define rt_mutex_owner_pending(lock) \ > +({ \ > + typecheck(struct rt_mutex *,(lock)); \ > + ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); \ > +}) Bizarre. The `typecheck' thingies were added, I assume, because these macros really wanted to be C functions? > +static inline void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, > + unsigned long msk) > +{ > + unsigned long val = ((unsigned long) owner) | msk; > + > + if (rt_mutex_has_waiters(lock)) > + val |= RT_MUTEX_HAS_WAITERS; > + > + lock->owner = (struct task_struct *)(val); > +} Might be getting a bit large for inlining. > +/* > + * Lock the full boosting chain. > + * > + * If 'try' is set, we have to backout if we hit a owner who is > + * running its own pi chain operation. We go back and take the slow > + * path via the pi_conflicts_lock. > + */ > +static int lock_pi_chain(struct rt_mutex *act_lock, > + struct rt_mutex_waiter *waiter, > + struct list_head *lock_chain, > + int try, int detect_deadlock) > +{ > + struct task_struct *owner; > + struct rt_mutex *nextlock, *lock = act_lock; > + struct rt_mutex_waiter *nextwaiter; > + > + /* > + * Debugging might turn deadlock detection on, unconditionally: > + */ > + detect_deadlock = debug_rt_mutex_detect_deadlock(detect_deadlock); > + > + for (;;) { > + owner = rt_mutex_owner(lock); > + > + /* Check for circular dependencies */ > + if (unlikely(owner->pi_locked_by == current)) { > + debug_rt_mutex_deadlock(detect_deadlock, waiter, lock); > + return detect_deadlock ? -EDEADLK : 0; > + } > + > + while (!spin_trylock(&owner->pi_lock)) { > + /* > + * Owner runs its own chain. Go back and take > + * the slow path > + */ > + if (try && owner->pi_locked_by == owner) > + return -EBUSY; > + cpu_relax(); > + } > + > + BUG_ON(owner->pi_locked_by); > + owner->pi_locked_by = current; > + BUG_ON(!list_empty(&owner->pi_lock_chain)); > + list_add(&owner->pi_lock_chain, lock_chain); > + > + /* > + * When the owner is blocked on a lock, try to take > + * the lock: > + */ > + nextwaiter = owner->pi_blocked_on; > + > + /* End of chain? */ > + if (!nextwaiter) > + return 0; We return zero with the spinlock held? I guess that's the point of the whole function. > + nextlock = nextwaiter->lock; > + > + /* Check for circular dependencies: */ > + if (unlikely(nextlock == act_lock || > + rt_mutex_owner(nextlock) == current)) { > + debug_rt_mutex_deadlock(detect_deadlock, waiter, > + nextlock); > + list_del_init(&owner->pi_lock_chain); > + owner->pi_locked_by = NULL; > + spin_unlock(&owner->pi_lock); > + return detect_deadlock ? -EDEADLK : 0; > + } But here we can return zero without having locked anything. How does the caller know what locks are held? This function needs a better covering description, IMO. > + /* Try to get nextlock->wait_lock: */ > + if (unlikely(!spin_trylock(&nextlock->wait_lock))) { > + list_del_init(&owner->pi_lock_chain); > + owner->pi_locked_by = NULL; > + spin_unlock(&owner->pi_lock); > + cpu_relax(); > + continue; > + } All these trylocks and cpu_relaxes are a worry. > +/* > + * Do the priority (un)boosting along the chain: > + */ > +static void adjust_pi_chain(struct rt_mutex *lock, > + struct rt_mutex_waiter *waiter, > + struct rt_mutex_waiter *top_waiter, > + struct list_head *lock_chain) > +{ > + struct task_struct *owner = rt_mutex_owner(lock); > + struct list_head *curr = lock_chain->prev; > + > + for (;;) { > + if (top_waiter) > + plist_del(&top_waiter->pi_list_entry, > + &owner->pi_waiters); > + > + if (waiter && waiter == rt_mutex_top_waiter(lock)) { rt_mutex_top_waiter() can never return NULL, so the test for NULL could be removed. > +/* > + * Slow path lock function: > + */ > +static int fastcall noinline __sched > +rt_mutex_slowlock(struct rt_mutex *lock, int state, > + struct hrtimer_sleeper *timeout, > + int detect_deadlock __IP_DECL__) > +{ heh, fastcall slowpath. Why's it noinline? > + struct rt_mutex_waiter waiter; > + int ret = 0; > + unsigned long flags; > + > + debug_rt_mutex_init_waiter(&waiter); > + waiter.task = NULL; > + > + spin_lock_irqsave(&lock->wait_lock, flags); > + > + /* Try to acquire the lock again: */ > + if (try_to_take_rt_mutex(lock __IP__)) { > + spin_unlock_irqrestore(&lock->wait_lock, flags); > + return 0; > + } > + > + BUG_ON(rt_mutex_owner(lock) == current); > + > + set_task_state(current, state); set_current_state() (Several more below) > +void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name) > +{ > + lock->owner = NULL; > + spin_lock_init(&lock->wait_lock); > + plist_head_init(&lock->wait_list); > + > + debug_rt_mutex_init(lock, name); > +} > +EXPORT_SYMBOL_GPL(__rt_mutex_init); What's the export for? Anyway. Tricky-looking stuff. ^ permalink raw reply [flat|nested] 12+ messages in thread
* [patch] PI-futex: -V2 2006-03-26 6:07 ` Andrew Morton @ 2006-03-26 16:03 ` Ingo Molnar 2006-03-26 19:16 ` Andrew Morton 2006-03-26 23:16 ` [patch] PI-futex: -V3 Ingo Molnar 0 siblings, 2 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-26 16:03 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, arjan, Esben Nielsen * Andrew Morton <akpm@osdl.org> wrote: > > +struct rt_mutex { > > + spinlock_t wait_lock; > > + struct plist_head wait_list; > > + struct task_struct *owner; > > +# ifdef CONFIG_DEBUG_RT_MUTEXES > The indented #-statments make some sense when we're using nested #ifs > (although I tend to accidentally-on-purpose delete them). But the > above ones aren't even nested.. fixed. > > +extern void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name); > > +extern void fastcall rt_mutex_destroy(struct rt_mutex *lock); > Does fastcall actually do any good? Isn't CONFIG_REGPARM equivalent > to that anyway? It's a bit of an eyesore. it's not needed - i removed all of them. (This code was started before REGPARM was reliable enough for distros to enable.) > > +#ifdef CONFIG_RT_MUTEXES > > +# define rt_mutex_init_task(p) \ > > + do { \ > > + spin_lock_init(&p->pi_lock); \ > > + plist_head_init(&p->pi_waiters); \ > > + p->pi_blocked_on = NULL; \ > > + p->pi_locked_by = NULL; \ > > + INIT_LIST_HEAD(&p->pi_lock_chain); \ > > + } while (0) > > Somewhere in there is a C function struggling to escape. ok, i've moved this to fork.c, into an inline function. > > Index: linux-pi-futex.mm.q/include/linux/rtmutex_internal.h > > Perhaps this could go in kernel/. If you think that's valuable. agreed, i moved it. (i also renamed it to rtmutex_common.h, and cleaned it up some more.) > > +#define task_top_pi_waiter(task) \ > > + plist_first_entry(&task->pi_waiters, struct rt_mutex_waiter, pi_list_entry) > > All of these can become C functions, yes? agreed, done. > > +#define rt_mutex_owner_pending(lock) \ > > +({ \ > > + typecheck(struct rt_mutex *,(lock)); \ > > + ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); \ > > +}) > > Bizarre. The `typecheck' thingies were added, I assume, because these > macros really wanted to be C functions? correct - i converted them to C inline functions. (These macros are the side-effect of them being in the generic rtmutex.h file for a long time - and that file gets included early on in the -rt tree, when various structures used by these macros are not defined yet.) > > +static inline void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, > > + unsigned long msk) > > +{ > > + unsigned long val = ((unsigned long) owner) | msk; > > + > > + if (rt_mutex_has_waiters(lock)) > > + val |= RT_MUTEX_HAS_WAITERS; > > + > > + lock->owner = (struct task_struct *)(val); > > +} > > Might be getting a bit large for inlining. agreed. I moved these functions to rtmutex.c and uninlined the bigger ones. > > + /* End of chain? */ > > + if (!nextwaiter) > > + return 0; > > We return zero with the spinlock held? I guess that's the point of the > whole function. correct. It's a "lock this chain" function call. > > + nextlock = nextwaiter->lock; > > + > > + /* Check for circular dependencies: */ > > + if (unlikely(nextlock == act_lock || > > + rt_mutex_owner(nextlock) == current)) { > > + debug_rt_mutex_deadlock(detect_deadlock, waiter, > > + nextlock); > > + list_del_init(&owner->pi_lock_chain); > > + owner->pi_locked_by = NULL; > > + spin_unlock(&owner->pi_lock); > > + return detect_deadlock ? -EDEADLK : 0; > > + } > > But here we can return zero without having locked anything. How does > the caller know what locks are held? this is a loop, and we might have taken other locks up to this point. The locks taken are the ones put into the p->pi_lock_chain list ... which list is walked at lock-release time. > This function needs a better covering description, IMO. ok, added some more comments explaining it better. > > + /* Try to get nextlock->wait_lock: */ > > + if (unlikely(!spin_trylock(&nextlock->wait_lock))) { > > + list_del_init(&owner->pi_lock_chain); > > + owner->pi_locked_by = NULL; > > + spin_unlock(&owner->pi_lock); > > + cpu_relax(); > > + continue; > > + } > > All these trylocks and cpu_relaxes are a worry. this is a typical lock-order solution we use in other places in the kernel: the common locking order is ->wait_lock + ->pi_lock - but in this case we have to take ->pi_lock first - hence the ->trylock, pi_lock-release, cpu_relax() and redoing of the pi_lock. > > +/* > > + * Do the priority (un)boosting along the chain: > > + */ > > +static void adjust_pi_chain(struct rt_mutex *lock, > > + struct rt_mutex_waiter *waiter, > > + struct rt_mutex_waiter *top_waiter, > > + struct list_head *lock_chain) > > +{ > > + struct task_struct *owner = rt_mutex_owner(lock); > > + struct list_head *curr = lock_chain->prev; > > + > > + for (;;) { > > + if (top_waiter) > > + plist_del(&top_waiter->pi_list_entry, > > + &owner->pi_waiters); > > + > > + if (waiter && waiter == rt_mutex_top_waiter(lock)) { > > rt_mutex_top_waiter() can never return NULL, so the test for NULL > could be removed. it might be NULL if adjust_pi_chain() is called from remove_waiter(), and next_waiter there is NULL (because !rt_mutex_has_waiters() after the removal of the current waiter). > > +/* > > + * Slow path lock function: > > + */ > > +static int fastcall noinline __sched > > +rt_mutex_slowlock(struct rt_mutex *lock, int state, > > + struct hrtimer_sleeper *timeout, > > + int detect_deadlock __IP_DECL__) > > +{ > > heh, fastcall slowpath. Why's it noinline? leftovers of cycle-level optimizations from the -rt tree. I've removed it, it's not needed anymore. > > + set_task_state(current, state); > > set_current_state() (Several more below) fixed. > > +EXPORT_SYMBOL_GPL(__rt_mutex_init); > > What's the export for? for the -rt tree. (latest -rt trees are based on this patch-queue and rt-mutex subsystem as-is) Right now the rt-tester code uses it - which isnt modular (but it could be made modular). Should i remove the export? find below a delta patch to the current patch-queue you have in -mm. Most of the patches in the -V2 queue were changed, so there are only two options: a full updated queue or to add this all-included update patch. this update includes the fixes for the things you noticed, plus a couple of more cleanups, and a fix from Esben Nielsen for the PI logic. I have merged the -V2 queue to the -rt tree [and have released 2.6.16-rt8] and have re-checked the PI code under load - it's all looking good. Ingo ------ From: Ingo Molnar <mingo@elte.hu> clean up the code as per Andrew's suggestions: - '# ifdef' => '#ifdef' - fastcall removal - lots of macro -> C function conversions - move rtmutex_internals.h to kernel/rtmutex_common.h - uninline two larger functions - remove noinline - explain locking better - set_task_state(current, state) => set_current_state(state) - fix the PI code (Esben Nielsen) Signed-off-by: Ingo Molnar <mingo@elte.hu> include/linux/rtmutex_internal.h | 187 --------------------------------------- linux/include/linux/rtmutex.h | 29 ++---- linux/kernel/fork.c | 11 ++ linux/kernel/futex.c | 3 linux/kernel/rtmutex-debug.c | 2 linux/kernel/rtmutex-debug.h | 2 linux/kernel/rtmutex.c | 116 ++++++++++++++++++++---- linux/kernel/rtmutex_common.h | 123 +++++++++++++++++++++++++ linux/kernel/sched.c | 2 9 files changed, 247 insertions(+), 228 deletions(-) Index: linux/include/linux/rtmutex.h =================================================================== --- linux.orig/include/linux/rtmutex.h +++ linux/include/linux/rtmutex.h @@ -27,14 +27,14 @@ struct rt_mutex { spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; -# ifdef CONFIG_DEBUG_RT_MUTEXES +#ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; struct list_head held_list; unsigned long acquire_ip; const char *name, *file; int line; void *magic; -# endif +#endif }; struct rt_mutex_waiter; @@ -79,40 +79,31 @@ struct hrtimer_sleeper; * * Returns 1 if the mutex is locked, 0 if unlocked. */ -static inline int fastcall rt_mutex_is_locked(struct rt_mutex *lock) +static inline int rt_mutex_is_locked(struct rt_mutex *lock) { return lock->owner != NULL; } -extern void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name); -extern void fastcall rt_mutex_destroy(struct rt_mutex *lock); +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void rt_mutex_destroy(struct rt_mutex *lock); -extern void fastcall rt_mutex_lock(struct rt_mutex *lock); -extern int fastcall rt_mutex_lock_interruptible(struct rt_mutex *lock, +extern void rt_mutex_lock(struct rt_mutex *lock); +extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); -extern int fastcall rt_mutex_timed_lock(struct rt_mutex *lock, +extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); -extern int fastcall rt_mutex_trylock(struct rt_mutex *lock); +extern int rt_mutex_trylock(struct rt_mutex *lock); -extern void fastcall rt_mutex_unlock(struct rt_mutex *lock); +extern void rt_mutex_unlock(struct rt_mutex *lock); #ifdef CONFIG_RT_MUTEXES -# define rt_mutex_init_task(p) \ - do { \ - spin_lock_init(&p->pi_lock); \ - plist_head_init(&p->pi_waiters); \ - p->pi_blocked_on = NULL; \ - p->pi_locked_by = NULL; \ - INIT_LIST_HEAD(&p->pi_lock_chain); \ - } while (0) # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ .pi_lock_chain = LIST_HEAD_INIT(tsk.pi_lock_chain), #else -# define rt_mutex_init_task(p) do { } while (0) # define INIT_RT_MUTEXES(tsk) #endif Index: linux/include/linux/rtmutex_internal.h =================================================================== --- linux.orig/include/linux/rtmutex_internal.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * RT Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This file contains the private data structure and API definitions. - */ - -#ifndef __LINUX_RT_MUTEX_INTERNAL_H -#define __LINUX_RT_MUTEX_INTERNAL_H - -#include <linux/rtmutex.h> - -/* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* - * This is the control structure for tasks blocked on a rt_mutex, - * which is allocated on the kernel stack on of the blocked task. - * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list - * @task: task reference to the blocked task - */ -struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; - struct task_struct *task; - struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; - pid_t deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif -}; - -/* - * Plist wrapper macros - */ -#define rt_mutex_has_waiters(lock) (!plist_head_empty(&lock->wait_list)) - -#define rt_mutex_top_waiter(lock) \ -({ struct rt_mutex_waiter *__w = plist_first_entry(&lock->wait_list, \ - struct rt_mutex_waiter, list_entry); \ - BUG_ON(__w->lock != lock); \ - __w; \ -}) - -#define task_has_pi_waiters(task) (!plist_head_empty(&task->pi_waiters)) - -#define task_top_pi_waiter(task) \ - plist_first_entry(&task->pi_waiters, struct rt_mutex_waiter, pi_list_entry) - -/* - * lock->owner state tracking: - * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. - * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 invalid state - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. - * - * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. - */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL - -#define rt_mutex_owner(lock) \ -({ \ - typecheck(struct rt_mutex *,(lock)); \ - ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL)); \ -}) - -#define rt_mutex_real_owner(lock) \ -({ \ - typecheck(struct rt_mutex *,(lock)); \ - ((struct task_struct *)((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS)); \ -}) - -#define rt_mutex_owner_pending(lock) \ -({ \ - typecheck(struct rt_mutex *,(lock)); \ - ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); \ -}) - -static inline void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long msk) -{ - unsigned long val = ((unsigned long) owner) | msk; - - if (rt_mutex_has_waiters(lock)) - val |= RT_MUTEX_HAS_WAITERS; - - lock->owner = (struct task_struct *)(val); -} - -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner; - - owner = ((unsigned long) lock->owner) & ~RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)(owner); -} - -static inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); -} - -/* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) - -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) - -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} - -#else - -# define rt_mutex_cmpxchg(l,c,n) (0) - -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner = ((unsigned long) lock->owner)| RT_MUTEX_HAS_WAITERS; - - lock->owner = (struct task_struct *) owner; -} - -#endif - -/* - * PI-futex support (proxy locking functions, etc.): - */ -extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); -#endif Index: linux/kernel/fork.c =================================================================== --- linux.orig/kernel/fork.c +++ linux/kernel/fork.c @@ -922,6 +922,17 @@ asmlinkage long sys_set_tid_address(int return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters); + p->pi_blocked_on = NULL; + p->pi_locked_by = NULL; + INIT_LIST_HEAD(&p->pi_lock_chain); +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. Index: linux/kernel/futex.c =================================================================== --- linux.orig/kernel/futex.c +++ linux/kernel/futex.c @@ -48,9 +48,10 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> -#include <linux/rtmutex_internal.h> #include <asm/futex.h> +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* Index: linux/kernel/rtmutex-debug.c =================================================================== --- linux.orig/kernel/rtmutex-debug.c +++ linux/kernel/rtmutex-debug.c @@ -27,7 +27,7 @@ #include <linux/plist.h> #include <linux/fs.h> -#include <linux/rtmutex_internal.h> +#include "rtmutex_common.h" #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" Index: linux/kernel/rtmutex-debug.h =================================================================== --- linux.orig/kernel/rtmutex-debug.h +++ linux/kernel/rtmutex-debug.h @@ -9,8 +9,6 @@ * This file contains macros used solely by rtmutex.c. Debug version. */ -#include <linux/rtmutex_internal.h> - #define __IP_DECL__ , unsigned long ip #define __IP__ , ip #define __RET_IP__ , (unsigned long)__builtin_return_address(0) Index: linux/kernel/rtmutex.c =================================================================== --- linux.orig/kernel/rtmutex.c +++ linux/kernel/rtmutex.c @@ -12,7 +12,7 @@ #include <linux/sched.h> #include <linux/timer.h> -#include <linux/rtmutex_internal.h> +#include "rtmutex_common.h" #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" @@ -21,6 +21,80 @@ #endif /* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 invalid state + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* * Calculate task priority from the waiter list priority * * Return task->normal_prio when the waiter list is empty or when @@ -87,6 +161,9 @@ static DEFINE_SPINLOCK(pi_conflicts_lock * If 'try' is set, we have to backout if we hit a owner who is * running its own pi chain operation. We go back and take the slow * path via the pi_conflicts_lock. + * + * We put all held locks into a list, via ->pi_lock_chain, and walk + * this list at unlock_pi_chain() time. */ static int lock_pi_chain(struct rt_mutex *act_lock, struct rt_mutex_waiter *waiter, @@ -222,10 +299,15 @@ static void adjust_pi_chain(struct rt_mu plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - if (waiter && waiter == rt_mutex_top_waiter(lock)) { + if (waiter) waiter->pi_list_entry.prio = waiter->task->prio; - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&top_waiter->pi_list_entry, + &owner->pi_waiters); } + __rt_mutex_adjust_prio(owner); waiter = owner->pi_blocked_on; @@ -605,7 +687,7 @@ static int remove_waiter(struct rt_mutex /* * Slow path lock function: */ -static int fastcall noinline __sched +static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock __IP_DECL__) @@ -711,7 +793,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, /* * Slow path try-lock function: */ -static inline int fastcall +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) { unsigned long flags; @@ -739,7 +821,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo /* * Slow path to release a rt-mutex: */ -static void fastcall noinline __sched +static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { unsigned long flags; @@ -773,7 +855,7 @@ rt_mutex_slowunlock(struct rt_mutex *loc static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, int detect_deadlock, - int fastcall (*slowfn)(struct rt_mutex *lock, int state, + int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock __IP_DECL__)) { @@ -787,7 +869,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock, - int fastcall (*slowfn)(struct rt_mutex *lock, int state, + int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock __IP_DECL__)) { @@ -800,7 +882,7 @@ rt_mutex_timed_fastlock(struct rt_mutex static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, - int fastcall (*slowfn)(struct rt_mutex *lock __IP_DECL__)) + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) { if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); @@ -811,7 +893,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lo static inline void rt_mutex_fastunlock(struct rt_mutex *lock, - void fastcall (*slowfn)(struct rt_mutex *lock)) + void (*slowfn)(struct rt_mutex *lock)) { if (likely(rt_mutex_cmpxchg(lock, current, NULL))) rt_mutex_deadlock_account_unlock(current); @@ -824,7 +906,7 @@ rt_mutex_fastunlock(struct rt_mutex *loc * * @lock: the rt_mutex to be locked */ -void fastcall __sched rt_mutex_lock(struct rt_mutex *lock) +void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); @@ -843,7 +925,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); * -EINTR when interrupted by a signal * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ -int fastcall __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock) { might_sleep(); @@ -868,7 +950,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interrup * -ETIMEOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ -int fastcall +int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock) { @@ -887,7 +969,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); * * Returns 1 on success and 0 on contention */ -int fastcall __sched rt_mutex_trylock(struct rt_mutex *lock) +int __sched rt_mutex_trylock(struct rt_mutex *lock) { return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } @@ -898,7 +980,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); * * @lock: the rt_mutex to be unlocked */ -void fastcall __sched rt_mutex_unlock(struct rt_mutex *lock) +void __sched rt_mutex_unlock(struct rt_mutex *lock) { rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } @@ -912,7 +994,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); * use of the mutex is forbidden. The mutex must not be locked when * this function is called. */ -void fastcall rt_mutex_destroy(struct rt_mutex *lock) +void rt_mutex_destroy(struct rt_mutex *lock) { WARN_ON(rt_mutex_is_locked(lock)); #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -931,7 +1013,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); * * Initializing of a locked rt lock is not allowed */ -void fastcall __rt_mutex_init(struct rt_mutex *lock, const char *name) +void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; spin_lock_init(&lock->wait_lock); Index: linux/kernel/rtmutex_common.h =================================================================== --- /dev/null +++ linux/kernel/rtmutex_common.h @@ -0,0 +1,123 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include <linux/rtmutex.h> + +/* + * The rtmutex in kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of complex boosting scenarios. + */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock) \ + do { \ + if (!(current->flags & PF_MUTEX_TESTER)) \ + schedule(); \ + else \ + schedule_rt_mutex_test(_lock); \ + } while (0) + +#else +# define schedule_rt_mutex(_lock) schedule() +#endif + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); +} + +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); +#endif Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -3895,7 +3895,7 @@ static void __setscheduler(struct task_s p->rt_priority = prio; p->normal_prio = normal_prio(p); - /* we are holding p->pi_list already */ + /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); /* * SCHED_BATCH tasks are treated as perpetual CPU hogs: ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch] PI-futex: -V2 2006-03-26 16:03 ` [patch] PI-futex: -V2 Ingo Molnar @ 2006-03-26 19:16 ` Andrew Morton 2006-03-26 23:30 ` Ingo Molnar 2006-03-26 23:16 ` [patch] PI-futex: -V3 Ingo Molnar 1 sibling, 1 reply; 12+ messages in thread From: Andrew Morton @ 2006-03-26 19:16 UTC (permalink / raw) To: Ingo Molnar; +Cc: linux-kernel, tglx, torvalds, arjan, simlo Ingo Molnar <mingo@elte.hu> wrote: > > > > + for (;;) { > > > + if (top_waiter) > > > + plist_del(&top_waiter->pi_list_entry, > > > + &owner->pi_waiters); > > > + > > > + if (waiter && waiter == rt_mutex_top_waiter(lock)) { > > > > rt_mutex_top_waiter() can never return NULL, so the test for NULL > > could be removed. > > it might be NULL if adjust_pi_chain() is called from remove_waiter(), > and next_waiter there is NULL (because !rt_mutex_has_waiters() after the > removal of the current waiter). Yes, `waiter' might be NULL. But rt_mutex_top_waiter() will never return NULL. So it might be possible to just do if (waiter == rt_mutex_top_waiter(lock)) Which might actually be less efficient, and more obscure. Just pointing it out. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch] PI-futex: -V2 2006-03-26 19:16 ` Andrew Morton @ 2006-03-26 23:30 ` Ingo Molnar 0 siblings, 0 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-26 23:30 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, arjan, simlo * Andrew Morton <akpm@osdl.org> wrote: > Ingo Molnar <mingo@elte.hu> wrote: > > > > > > + for (;;) { > > > > + if (top_waiter) > > > > + plist_del(&top_waiter->pi_list_entry, > > > > + &owner->pi_waiters); > > > > + > > > > + if (waiter && waiter == rt_mutex_top_waiter(lock)) { > > > > > > rt_mutex_top_waiter() can never return NULL, so the test for NULL > > > could be removed. > > > > it might be NULL if adjust_pi_chain() is called from remove_waiter(), > > and next_waiter there is NULL (because !rt_mutex_has_waiters() after the > > removal of the current waiter). > > Yes, `waiter' might be NULL. But rt_mutex_top_waiter() will never > return NULL. So it might be possible to just do > > if (waiter == rt_mutex_top_waiter(lock)) ah, indeed, you are right. > Which might actually be less efficient, and more obscure. Just > pointing it out. ok, i left it as-is for now. Ingo ^ permalink raw reply [flat|nested] 12+ messages in thread
* [patch] PI-futex: -V3 2006-03-26 16:03 ` [patch] PI-futex: -V2 Ingo Molnar 2006-03-26 19:16 ` Andrew Morton @ 2006-03-26 23:16 ` Ingo Molnar 2006-03-28 10:17 ` [patch-mm2] PI-futex: fix timeout race Thomas Gleixner 2006-03-31 19:14 ` [patch] PI-futex patchset: -V4 Ingo Molnar 1 sibling, 2 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-26 23:16 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, arjan, Esben Nielsen this is version -V3 of the PI-futex patchset (ontop of current -mm plus the -V2 delta patch). Changes since -V2: - do the -> set_current_state() cleanups for real - make plist_add() more readable - whitespace cleanups in kernel/futex.c - further cleanups in rtmutex_common.h - fix cornercase race in the PI-locking code (it triggered in the -rt tree) Signed-off-by: Ingo Molnar <mingo@elte.hu> ---- kernel/futex.c | 4 kernel/rtmutex.c | 245 +++++++++++++++++++++++++----------------------- kernel/rtmutex_common.h | 6 - lib/plist.c | 3 4 files changed, 136 insertions(+), 122 deletions(-) Index: linux-pi-futex.mm.q/kernel/futex.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/futex.c +++ linux-pi-futex.mm.q/kernel/futex.c @@ -425,7 +425,7 @@ void exit_pi_state_list(struct task_stru */ spin_lock(&curr->pi_lock); while (!list_empty(head)) { - + next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; @@ -439,7 +439,7 @@ void exit_pi_state_list(struct task_stru spin_unlock(&hb->lock); continue; } - + list_del_init(&pi_state->list); WARN_ON(pi_state->owner != curr); Index: linux-pi-futex.mm.q/kernel/rtmutex.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex.c +++ linux-pi-futex.mm.q/kernel/rtmutex.c @@ -173,11 +173,12 @@ static int lock_pi_chain(struct rt_mutex struct task_struct *owner; struct rt_mutex *nextlock, *lock = act_lock; struct rt_mutex_waiter *nextwaiter; + int deadlock_detect; /* * Debugging might turn deadlock detection on, unconditionally: */ - detect_deadlock = debug_rt_mutex_detect_deadlock(detect_deadlock); + deadlock_detect = debug_rt_mutex_detect_deadlock(detect_deadlock); for (;;) { owner = rt_mutex_owner(lock); @@ -185,7 +186,7 @@ static int lock_pi_chain(struct rt_mutex /* Check for circular dependencies */ if (unlikely(owner->pi_locked_by == current)) { debug_rt_mutex_deadlock(detect_deadlock, waiter, lock); - return detect_deadlock ? -EDEADLK : 0; + return detect_deadlock ? -EDEADLK : 1; } while (!spin_trylock(&owner->pi_lock)) { @@ -211,7 +212,7 @@ static int lock_pi_chain(struct rt_mutex /* End of chain? */ if (!nextwaiter) - return 0; + return 1; nextlock = nextwaiter->lock; @@ -223,7 +224,7 @@ static int lock_pi_chain(struct rt_mutex list_del_init(&owner->pi_lock_chain); owner->pi_locked_by = NULL; spin_unlock(&owner->pi_lock); - return detect_deadlock ? -EDEADLK : 0; + return detect_deadlock ? -EDEADLK : 1; } /* Try to get nextlock->wait_lock: */ @@ -242,7 +243,7 @@ static int lock_pi_chain(struct rt_mutex * for userspace locks), we have to walk the full chain * unconditionally. */ - if (detect_deadlock) + if (deadlock_detect) continue; /* @@ -255,11 +256,11 @@ static int lock_pi_chain(struct rt_mutex /* If the top waiter has higher priority, stop: */ if (rt_mutex_top_waiter(lock)->list_entry.prio <= waiter->list_entry.prio) - return 0; + return 1; } else { /* If nextwaiter is not the top waiter, stop: */ if (rt_mutex_top_waiter(lock) != nextwaiter) - return 0; + return 1; } } } @@ -340,103 +341,6 @@ static void adjust_pi_chain(struct rt_mu } /* - * Task blocks on lock. - * - * Prepare waiter and potentially propagate our priority into the pi chain. - * - * This must be called with lock->wait_lock held. - */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - int detect_deadlock __IP_DECL__) -{ - int res = 0; - struct rt_mutex_waiter *top_waiter = waiter; - LIST_HEAD(lock_chain); - - waiter->task = current; - waiter->lock = lock; - debug_rt_mutex_reset_waiter(waiter); - - spin_lock(¤t->pi_lock); - current->pi_locked_by = current; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); - - /* Get the top priority waiter of the lock: */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - current->pi_blocked_on = waiter; - - /* - * Call adjust_prio_chain, when waiter is the new top waiter - * or when deadlock detection is requested: - */ - if (waiter != rt_mutex_top_waiter(lock) && - !debug_rt_mutex_detect_deadlock(detect_deadlock)) - goto out; - - /* Try to lock the full chain: */ - res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock); - - if (likely(!res)) - adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); - - /* Common case: we managed to lock it: */ - if (res != -EBUSY) - goto out_unlock; - - /* Rare case: we hit some other task running a pi chain operation: */ - unlock_pi_chain(&lock_chain); - - plist_del(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = NULL; - current->pi_locked_by = NULL; - spin_unlock(¤t->pi_lock); - fixup_rt_mutex_waiters(lock); - - spin_unlock(&lock->wait_lock); - - spin_lock(&pi_conflicts_lock); - - spin_lock(¤t->pi_lock); - current->pi_locked_by = current; - spin_lock(&lock->wait_lock); - if (!rt_mutex_owner(lock)) { - waiter->task = NULL; - spin_unlock(&pi_conflicts_lock); - goto out; - } - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); - - /* Get the top priority waiter of the lock: */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - current->pi_blocked_on = waiter; - - /* Lock the full chain: */ - res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock); - - /* Drop the conflicts lock before adjusting: */ - spin_unlock(&pi_conflicts_lock); - - if (likely(!res)) - adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); - - out_unlock: - unlock_pi_chain(&lock_chain); - out: - current->pi_locked_by = NULL; - spin_unlock(¤t->pi_lock); - return res; -} - -/* * Optimization: check if we can steal the lock from the * assigned pending owner [which might not have taken the * lock yet]: @@ -542,6 +446,117 @@ static int try_to_take_rt_mutex(struct r } /* + * Task blocks on lock. + * + * Prepare waiter and potentially propagate our priority into the pi chain. + * + * This must be called with lock->wait_lock held. + * return values: 1: waiter queued, 0: got the lock, + * -EDEADLK: deadlock detected. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + LIST_HEAD(lock_chain); + int res = 1; + + waiter->task = current; + waiter->lock = lock; + debug_rt_mutex_reset_waiter(waiter); + + spin_lock(¤t->pi_lock); + current->pi_locked_by = current; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter of the lock: */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + /* + * Call adjust_prio_chain, when waiter is the new top waiter + * or when deadlock detection is requested: + */ + if (waiter != rt_mutex_top_waiter(lock) && + !debug_rt_mutex_detect_deadlock(detect_deadlock)) + goto out_unlock_pi; + + /* Try to lock the full chain: */ + res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock); + + if (likely(res == 1)) + adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); + + /* Common case: we managed to lock it: */ + if (res != -EBUSY) + goto out_unlock_chain_pi; + + /* Rare case: we hit some other task running a pi chain operation: */ + unlock_pi_chain(&lock_chain); + + plist_del(&waiter->list_entry, &lock->wait_list); + current->pi_blocked_on = NULL; + current->pi_locked_by = NULL; + spin_unlock(¤t->pi_lock); + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Here we have dropped all locks, and take the global + * pi_conflicts_lock. We have to redo all the work, no + * previous information about the lock is valid anymore: + */ + spin_lock(&pi_conflicts_lock); + + spin_lock(&lock->wait_lock); + if (try_to_take_rt_mutex(lock __IP__)) { + /* + * Rare race: against all odds we got the lock. + */ + res = 0; + goto out; + } + + WARN_ON(!rt_mutex_owner(lock) || rt_mutex_owner(lock) == current); + + spin_lock(¤t->pi_lock); + current->pi_locked_by = current; + + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter of the lock: */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + /* Lock the full chain: */ + res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock); + + /* Drop the conflicts lock before adjusting: */ + spin_unlock(&pi_conflicts_lock); + + if (likely(res == 1)) + adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); + + out_unlock_chain_pi: + unlock_pi_chain(&lock_chain); + out_unlock_pi: + current->pi_locked_by = NULL; + spin_unlock(¤t->pi_lock); + out: + return res; +} + +/* * Wake up the next waiter on the lock. * * Remove the top waiter from the current tasks waiter list and from @@ -641,11 +656,11 @@ static int remove_waiter(struct rt_mutex spin_lock(&pi_conflicts_lock); + spin_lock(&lock->wait_lock); + spin_lock(¤t->pi_lock); current->pi_locked_by = current; - spin_lock(&lock->wait_lock); - /* We might have been woken up: */ if (!waiter->task) { spin_unlock(&pi_conflicts_lock); @@ -709,7 +724,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, BUG_ON(rt_mutex_owner(lock) == current); - set_task_state(current, state); + set_current_state(state); /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) @@ -743,10 +758,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, detect_deadlock __IP__); - if (ret == -EDEADLK) + /* got the lock or deadlock: */ + if (ret == 0 || ret == -EDEADLK) break; - if (ret == -EBUSY) - continue; + ret = 0; } spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -757,10 +772,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, schedule_rt_mutex(lock); spin_lock_irqsave(&lock->wait_lock, flags); - set_task_state(current, state); + set_current_state(state); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) remove_waiter(lock, &waiter __IP__); @@ -806,11 +821,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo ret = try_to_take_rt_mutex(lock __IP__); /* * try_to_take_rt_mutex() sets the lock waiters - * bit. We might be the only waiter. Check if this - * needs to be cleaned up. + * bit unconditionally. Clean this up. */ - if (!ret) - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock); } spin_unlock_irqrestore(&lock->wait_lock, flags); Index: linux-pi-futex.mm.q/kernel/rtmutex_common.h =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex_common.h +++ linux-pi-futex.mm.q/kernel/rtmutex_common.h @@ -98,18 +98,18 @@ task_top_pi_waiter(struct task_struct *p static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { return (struct task_struct *) - ((unsigned long)((lock)->owner) & ~RT_MUTEX_OWNER_MASKALL); + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) { return (struct task_struct *) - ((unsigned long)((lock)->owner) & ~RT_MUTEX_HAS_WAITERS); + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) { - return ((unsigned long)((lock)->owner) & RT_MUTEX_OWNER_PENDING); + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; } /* Index: linux-pi-futex.mm.q/lib/plist.c =================================================================== --- linux-pi-futex.mm.q.orig/lib/plist.c +++ linux-pi-futex.mm.q/lib/plist.c @@ -38,7 +38,7 @@ void plist_add(struct plist_node *node, WARN_ON(!plist_node_empty(node)); - list_for_each_entry(iter, &head->prio_list, plist.prio_list) + list_for_each_entry(iter, &head->prio_list, plist.prio_list) { if (node->prio < iter->prio) goto lt_prio; else if (node->prio == iter->prio) { @@ -46,6 +46,7 @@ void plist_add(struct plist_node *node, struct plist_node, plist.prio_list); goto eq_prio; } + } lt_prio: list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); ^ permalink raw reply [flat|nested] 12+ messages in thread
* [patch-mm2] PI-futex: fix timeout race 2006-03-26 23:16 ` [patch] PI-futex: -V3 Ingo Molnar @ 2006-03-28 10:17 ` Thomas Gleixner 2006-03-31 19:14 ` [patch] PI-futex patchset: -V4 Ingo Molnar 1 sibling, 0 replies; 12+ messages in thread From: Thomas Gleixner @ 2006-03-28 10:17 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel The futex code has consequently the same timeout race as generic sleeper based nanosleep. Call hrtimer_cancel() unconditionally. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Index: linux-2.6.16-mm2/kernel/rtmutex.c =================================================================== --- linux-2.6.16-mm2.orig/kernel/rtmutex.c +++ linux-2.6.16-mm2/kernel/rtmutex.c @@ -789,7 +789,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ - if (unlikely(timeout && timeout->task)) + if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); /* ^ permalink raw reply [flat|nested] 12+ messages in thread
* [patch] PI-futex patchset: -V4 2006-03-26 23:16 ` [patch] PI-futex: -V3 Ingo Molnar 2006-03-28 10:17 ` [patch-mm2] PI-futex: fix timeout race Thomas Gleixner @ 2006-03-31 19:14 ` Ingo Molnar 2006-03-31 19:23 ` Ingo Molnar 2006-04-01 0:19 ` Peter Williams 1 sibling, 2 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-31 19:14 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, Esben Nielsen this is version -V4 of the PI-futex patchset (ontop of current -mm2, which includes -V3.) A clean queue of split-up patches can be found at: http://redhat.com/~mingo/PI-futex-patches/PI-futex-patches-V4.tar.gz the -V4 codebase has been tested on the glibc code (all testcases pass) and under load as well. (The -V4 code is included in the 2.6.16-rt12 code as well that i released earlier today.) Changes since -V3: - added Esben Nielsen's PI locking code, Thomas Gleixner made it work in cornercases and under load. This is significantly simpler (it removes 50 lines of code from rtmutex.c). The main difference is that instead of holding all locks along a dependency chain, this code propagates PI priorities (and detects deadlocks) by holding at most two locks at once, and by being preemptible between such steps. - Jakub Jelinek did a detailed review of the new futex code and found some new races, which Thomas Gleixner fixed. - to fix a pthread_mutex_trylock() related race, FUTEX_TRYLOCK_PI has been added (Thomas Gleixner) - documentation fixes based on feedback from Tim Bird - added Documentation/pi-futex.txt (in addition to rt-mutex.txt) - added the plist debugging patch (which was part of -rt but wasnt part of the pi-futex queue before). This caught a couple of SMP bugs in the past. - implemented more scalable held-locks debugging - it's now a per-task list of held locks, instead of a global list. This is similarly effective to the global list, but much more scalable. (This approach will also be added to the stock kernel/mutex.c code.) - do not fiddle with irq flags in rtmutex.c - it's not needed. - clone/fork fix: do not let parent's potential PI priority 'leak' into child threads or processes. - added /proc/sys/kernel/max_lock_depth with a default limit of 1024, to limit the amount of deadlock-checking the kernel will do. - small enhancement to the t3-l1-pi-signal.tst testcase. Signed-off-by: Ingo Molnar <mingo@elte.hu> -- Documentation/rtmutex.txt | 60 - linux-pi-futex.mm.q/Documentation/pi-futex.txt | 121 +++ linux-pi-futex.mm.q/Documentation/robust-futexes.txt | 2 linux-pi-futex.mm.q/Documentation/rt-mutex.txt | 74 + linux-pi-futex.mm.q/include/linux/futex.h | 1 linux-pi-futex.mm.q/include/linux/rtmutex.h | 16 linux-pi-futex.mm.q/include/linux/sched.h | 9 linux-pi-futex.mm.q/include/linux/sysctl.h | 1 linux-pi-futex.mm.q/kernel/fork.c | 8 linux-pi-futex.mm.q/kernel/futex.c | 88 +- linux-pi-futex.mm.q/kernel/rtmutex-debug.c | 186 ++-- linux-pi-futex.mm.q/kernel/rtmutex-debug.h | 9 linux-pi-futex.mm.q/kernel/rtmutex.c | 560 +++++--------- linux-pi-futex.mm.q/kernel/rtmutex.h | 3 linux-pi-futex.mm.q/kernel/sched.c | 8 linux-pi-futex.mm.q/kernel/sysctl.c | 15 linux-pi-futex.mm.q/scripts/rt-tester/t3-l1-pi-signal.tst | 3 17 files changed, 631 insertions(+), 533 deletions(-) Index: linux-pi-futex.mm.q/Documentation/pi-futex.txt =================================================================== --- /dev/null +++ linux-pi-futex.mm.q/Documentation/pi-futex.txt @@ -0,0 +1,121 @@ +Lightweight PI-futexes +---------------------- + +We are calling them lightweight for 3 reasons: + + - in the user-space fastpath a PI-enabled futex involves no kernel work + (or any other PI complexity) at all. No registration, no extra kernel + calls - just pure fast atomic ops in userspace. + + - even in the slowpath, the system call and scheduling pattern is very + similar to normal futexes. + + - the in-kernel PI implementation is streamlined around the mutex + abstraction, with strict rules that keep the implementation + relatively simple: only a single owner may own a lock (i.e. no + read-write lock support), only the owner may unlock a lock, no + recursive locking, etc. + +Priority Inheritance - why? +--------------------------- + +The short reply: user-space PI helps achieving/improving determinism for +user-space applications. In the best-case, it can help achieve +determinism and well-bound latencies. Even in the worst-case, PI will +improve the statistical distribution of locking related application +delays. + +The longer reply: +----------------- + +Firstly, sharing locks between multiple tasks is a common programming +technique that often cannot be replaced with lockless algorithms. As we +can see it in the kernel [which is a quite complex program in itself], +lockless structures are rather the exception than the norm - the current +ratio of lockless vs. locky code for shared data structures is somewhere +between 1:10 and 1:100. Lockless is hard, and the complexity of lockless +algorithms often endangers to ability to do robust reviews of said code. +I.e. critical RT apps often choose lock structures to protect critical +data structures, instead of lockless algorithms. Furthermore, there are +cases (like shared hardware, or other resource limits) where lockless +access is mathematically impossible. + +Media players (such as Jack) are an example of reasonable application +design with multiple tasks (with multiple priority levels) sharing +short-held locks: for example, a highprio audio playback thread is +combined with medium-prio construct-audio-data threads and low-prio +display-colory-stuff threads. Add video and decoding to the mix and +we've got even more priority levels. + +So once we accept that synchronization objects (locks) are an +unavoidable fact of life, and once we accept that multi-task userspace +apps have a very fair expectation of being able to use locks, we've got +to think about how to offer the option of a deterministic locking +implementation to user-space. + +Most of the technical counter-arguments against doing priority +inheritance only apply to kernel-space locks. But user-space locks are +different, there we cannot disable interrupts or make the task +non-preemptible in a critical section, so the 'use spinlocks' argument +does not apply (user-space spinlocks have the same priority inversion +problems as other user-space locking constructs). Fact is, pretty much +the only technique that currently enables good determinism for userspace +locks (such as futex-based pthread mutexes) is priority inheritance: + +Currently (without PI), if a high-prio and a low-prio task shares a lock +[this is a quite common scenario for most non-trivial RT applications], +even if all critical sections are coded carefully to be deterministic +(i.e. all critical sections are short in duration and only execute a +limited number of instructions), the kernel cannot guarantee any +deterministic execution of the high-prio task: any medium-priority task +could preempt the low-prio task while it holds the shared lock and +executes the critical section, and could delay it indefinitely. + +Implementation: +--------------- + +As mentioned before, the userspace fastpath of PI-enabled pthread +mutexes involves no kernel work at all - they behave quite similarly to +normal futex-based locks: a 0 value means unlocked, and a value==TID +means locked. (This is the same method as used by list-based robust +futexes.) Userspace uses atomic ops to lock/unlock these mutexes without +entering the kernel. + +To handle the slowpath, we have added two new futex ops: + + FUTEX_LOCK_PI + FUTEX_UNLOCK_PI + +If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to +TID fails], then FUTEX_LOCK_PI is called. The kernel does all the +remaining work: if there is no futex-queue attached to the futex address +yet then the code looks up the task that owns the futex [it has put its +own TID into the futex value], and attaches a 'PI state' structure to +the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware, +kernel-based synchronization object. The 'other' task is made the owner +of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the +futex value. Then this task tries to lock the rt-mutex, on which it +blocks. Once it returns, it has the mutex acquired, and it sets the +futex value to its own TID and returns. Userspace has no other work to +perform - it now owns the lock, and futex value contains +FUTEX_WAITERS|TID. + +If the unlock side fastpath succeeds, [i.e. userspace manages to do a +TID -> 0 atomic transition of the futex value], then no kernel work is +triggered. + +If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), +then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the +behalf of userspace - and it also unlocks the attached +pi_state->rt_mutex and thus wakes up any potential waiters. + +Note that under this approach, contrary to previous PI-futex approaches, +there is no prior 'registration' of a PI-futex. [which is not quite +possible anyway, due to existing ABI properties of pthread mutexes.] + +Also, under this scheme, 'robustness' and 'PI' are two orthogonal +properties of futexes, and all four combinations are possible: futex, +robust-futex, PI-futex, robust+PI-futex. + +More details about priority inheritance can be found in +Documentation/rtmutex.txt. Index: linux-pi-futex.mm.q/Documentation/robust-futexes.txt =================================================================== --- linux-pi-futex.mm.q.orig/Documentation/robust-futexes.txt +++ linux-pi-futex.mm.q/Documentation/robust-futexes.txt @@ -95,7 +95,7 @@ comparison. If the thread has registered is empty. If the thread/process crashed or terminated in some incorrect way then the list might be non-empty: in this case the kernel carefully walks the list [not trusting it], and marks all locks that are owned by -this thread with the FUTEX_OWNER_DEAD bit, and wakes up one waiter (if +this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if any). The list is guaranteed to be private and per-thread at do_exit() time, Index: linux-pi-futex.mm.q/Documentation/rt-mutex.txt =================================================================== --- /dev/null +++ linux-pi-futex.mm.q/Documentation/rt-mutex.txt @@ -0,0 +1,74 @@ +RT-mutex subsystem with PI support +---------------------------------- + +RT-mutexes with priority inheritance are used to support PI-futexes, +which enable pthread_mutex_t priority inheritance attributes +(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details +about PI-futexes.] + +This technology was developed in the -rt tree and streamlined for +pthread_mutex support. + +Basic principles: +----------------- + +RT-mutexes extend the semantics of simple mutexes by the priority +inheritance protocol. + +A low priority owner of a rt-mutex inherits the priority of a higher +priority waiter until the rt-mutex is released. If the temporarily +boosted owner blocks on a rt-mutex itself it propagates the priority +boosting to the owner of the other rt_mutex it gets blocked on. The +priority boosting is immediately removed once the rt_mutex has been +unlocked. + +This approach allows us to shorten the block of high-prio tasks on +mutexes which protect shared resources. Priority inheritance is not a +magic bullet for poorly designed applications, but it allows +well-designed applications to use userspace locks in critical parts of +an high priority thread, without losing determinism. + +The enqueueing of the waiters into the rtmutex waiter list is done in +priority order. For same priorities FIFO order is chosen. For each +rtmutex, only the top priority waiter is enqueued into the owner's +priority waiters list. This list too queues in priority order. Whenever +the top priority waiter of a task changes (for example it timed out or +got a signal), the priority of the owner task is readjusted. [The +priority enqueueing is handled by "plists", see include/linux/plist.h +for more details.] + +RT-mutexes are optimized for fastpath operations and have no internal +locking overhead when locking an uncontended mutex or unlocking a mutex +without waiters. The optimized fastpath operations require cmpxchg +support. [If that is not available then the rt-mutex internal spinlock +is used] + +The state of the rt-mutex is tracked via the owner field of the rt-mutex +structure: + +rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 +are used to keep track of the "owner is pending" and "rtmutex has +waiters" state. + + owner bit1 bit0 + NULL 0 0 mutex is free (fast acquire possible) + NULL 0 1 invalid state + NULL 1 0 invalid state + NULL 1 1 invalid state + taskpointer 0 0 mutex is held (fast release possible) + taskpointer 0 1 task is pending owner + taskpointer 1 0 mutex is held and has waiters + taskpointer 1 1 task is pending owner and mutex has waiters + +Pending-ownership handling is a performance optimization: +pending-ownership is assigned to the first (highest priority) waiter of +the mutex, when the mutex is released. The thread is woken up and once +it starts executing it can acquire the mutex. Until the mutex is taken +by it (bit 0 is cleared) a competing higher priority thread can "steal" +the mutex which puts the woken up thread back on the waiters list. + +The pending-ownership optimization is especially important for the +uninterrupted workflow of high-prio tasks which repeatedly +takes/releases locks that have lower-prio waiters. Without this +optimization the higher-prio thread would ping-pong to the lower-prio +task [because at unlock time we always assign a new owner]. Index: linux-pi-futex.mm.q/Documentation/rtmutex.txt =================================================================== --- linux-pi-futex.mm.q.orig/Documentation/rtmutex.txt +++ /dev/null @@ -1,60 +0,0 @@ -RT Mutex Subsystem with PI support - -RT Mutexes with priority inheritance are used to support pthread_mutexes -with priority inheritance attributes. - -The basic technology was developed in the preempt-rt tree and -streamlined for the pthread_mutex support. - -RT Mutexes extend the semantics of Mutexes by the priority inheritance -protocol. Sharing code and data structures with the Mutex code is not -feasible due to the extended requirements of RT Mutexes. - -Basic operation principle: - -A low priority owner of a rt_mutex inherits the priority of a higher -priority waiter until the mutex is released. Is the temporary priority -boosted owner blocked on a rt_mutex itself it propagates the priority -boosting to the owner of the rt_mutex it is blocked on. The priority -boosting is immidiately removed once the rt_mutex has been unlocked. -This technology allows to shorten the blocking on mutexes which -protect shared resources. Priority inheritance is not a magic bullet -for poorly designed applications, but allows optimizations in cases -where the protection of shared resources might affect critical parts -of an high priority thread. - -The enqueueing of the waiters into the rtmutex waiter list is done in -priority order. In case of the same priority FIFO order is chosen. Per -rtmutex only the top priority waiter is enqueued into the owners -priority waiters list. Also this list enqueues in priority -order. Whenever the top priority waiter of a task is changed the -priority of the task is readjusted. The priority enqueueing is handled -by plists, see also include/linux/plist.h for further explanation. - -RT Mutexes are optimized for fastpath operations and have no runtime -overhead in case of locking an uncontended mutex or unlocking a mutex -without waiters. The optimized fathpath operations require cmpxchg -support. - -The state of the rtmutex is tracked via the owner field of the -rt_mutex structure: - -rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and -1 are used to keep track of the "owner is pending" and "rtmutex has -waiters" state. - -owner bit1 bit0 -NULL 0 0 mutex is free (fast acquire possible) -NULL 0 1 invalid state -NULL 1 0 invalid state -NULL 1 1 invalid state -taskpointer 0 0 mutex is held (fast release possible) -taskpointer 0 1 task is pending owner -taskpointer 1 0 mutex is held and has waiters -taskpointer 1 1 task is pending owner and mutex has more waiters - -Pending ownership is assigned to the first (highest priority) waiter -of the mutex, when the mutex is released. The thread is woken up and -can now acquire the mutex. Until the mutex is taken (bit 0 cleared) a -competing higher priority thread can steal the mutex which puts the -woken up thread back on the waiters list. Index: linux-pi-futex.mm.q/include/linux/futex.h =================================================================== --- linux-pi-futex.mm.q.orig/include/linux/futex.h +++ linux-pi-futex.mm.q/include/linux/futex.h @@ -14,6 +14,7 @@ #define FUTEX_WAKE_OP 5 #define FUTEX_LOCK_PI 6 #define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 /* * Support for robust futexes: the kernel cleans up held futexes at Index: linux-pi-futex.mm.q/include/linux/rtmutex.h =================================================================== --- linux-pi-futex.mm.q.orig/include/linux/rtmutex.h +++ linux-pi-futex.mm.q/include/linux/rtmutex.h @@ -29,7 +29,7 @@ struct rt_mutex { struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - struct list_head held_list; + struct list_head held_list_entry; unsigned long acquire_ip; const char *name, *file; int line; @@ -66,7 +66,7 @@ struct hrtimer_sleeper; #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = SPIN_LOCK_UNLOCKED \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -98,11 +98,19 @@ extern int rt_mutex_trylock(struct rt_mu extern void rt_mutex_unlock(struct rt_mutex *lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define INIT_RT_MUTEX_DEBUG(tsk) \ + .held_list_head = LIST_HEAD_INIT(tsk.held_list_head), \ + .held_list_lock = SPIN_LOCK_UNLOCKED +#else +# define INIT_RT_MUTEX_DEBUG(tsk) +#endif + #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ - .pi_lock_chain = LIST_HEAD_INIT(tsk.pi_lock_chain), + INIT_RT_MUTEX_DEBUG(tsk) #else # define INIT_RT_MUTEXES(tsk) #endif Index: linux-pi-futex.mm.q/include/linux/sched.h =================================================================== --- linux-pi-futex.mm.q.orig/include/linux/sched.h +++ linux-pi-futex.mm.q/include/linux/sched.h @@ -865,9 +865,10 @@ struct task_struct { struct plist_head pi_waiters; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; - /* PI locking helpers */ - struct task_struct *pi_locked_by; - struct list_head pi_lock_chain; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spinlock_t held_list_lock; + struct list_head held_list_head; +# endif #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -984,7 +985,7 @@ static inline void put_task_struct(struc #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ +#define PF_MUTEX_TESTER 0x02000000 /* Thread belongs to the rt mutex tester */ /* * Only the _current_ task can read/write to tsk->flags, but other Index: linux-pi-futex.mm.q/include/linux/sysctl.h =================================================================== --- linux-pi-futex.mm.q.orig/include/linux/sysctl.h +++ linux-pi-futex.mm.q/include/linux/sysctl.h @@ -148,6 +148,7 @@ enum KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ + KERN_MAX_LOCK_DEPTH=73 }; Index: linux-pi-futex.mm.q/kernel/fork.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/fork.c +++ linux-pi-futex.mm.q/kernel/fork.c @@ -926,10 +926,12 @@ static inline void rt_mutex_init_task(st { #ifdef CONFIG_RT_MUTEXES spin_lock_init(&p->pi_lock); - plist_head_init(&p->pi_waiters); + plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; - p->pi_locked_by = NULL; - INIT_LIST_HEAD(&p->pi_lock_chain); +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif #endif } Index: linux-pi-futex.mm.q/kernel/futex.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/futex.c +++ linux-pi-futex.mm.q/kernel/futex.c @@ -480,6 +480,7 @@ lookup_pi_state(u32 uval, struct futex_h return 0; } } + /* * We are the first waiter - try to look up the real owner and * attach the new pi_state to it: @@ -930,9 +931,7 @@ static int unqueue_me(struct futex_q *q) WARN_ON(list_empty(&q->list)); list_del(&q->list); - if (q->pi_state) - free_pi_state(q->pi_state); - q->pi_state = NULL; + BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; @@ -942,6 +941,24 @@ static int unqueue_me(struct futex_q *q) return ret; } +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +{ + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { struct task_struct *curr = current; @@ -1062,7 +1079,7 @@ static int futex_wait(u32 __user *uaddr, * races the kernel might see a 0 value of the futex too.) */ static int futex_lock_pi(u32 __user *uaddr, int detect, - unsigned long sec, long nsec) + unsigned long sec, long nsec, int trylock) { struct task_struct *curr = current; struct futex_hash_bucket *hb; @@ -1138,8 +1155,32 @@ static int futex_lock_pi(u32 __user *uad * we are the first waiter): */ ret = lookup_pi_state(uval, hb, &q); - if (unlikely(ret)) + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. + */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | FUTEX_OWNER_DIED; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } goto out_unlock_release_sem; + } /* * Only actually queue now that the atomic ops are done: @@ -1156,7 +1197,16 @@ static int futex_lock_pi(u32 __user *uad /* * Block on the PI mutex: */ - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); /* * Got the lock. We might not be the anticipated owner if we @@ -1165,9 +1215,6 @@ static int futex_lock_pi(u32 __user *uad if (!ret && q.pi_state->owner != curr) { u32 newtid = current->pid | FUTEX_WAITERS; - down_read(&curr->mm->mmap_sem); - hb = queue_lock(&q, -1, NULL); - /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock(&q.pi_state->owner->pi_lock); @@ -1182,7 +1229,8 @@ static int futex_lock_pi(u32 __user *uad list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock(¤t->pi_lock); - queue_unlock(&q, hb); + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); up_read(&curr->mm->mmap_sem); /* * We own it, so we have to replace the pending owner @@ -1200,10 +1248,21 @@ static int futex_lock_pi(u32 __user *uad break; uval = curval; } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); } - unqueue_me(&q); - if (!detect && ret == -EDEADLK && 0) force_sig(SIGKILL, current); @@ -1652,11 +1711,14 @@ long do_futex(u32 __user *uaddr, int op, ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, val2); + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); break; case FUTEX_UNLOCK_PI: ret = futex_unlock_pi(uaddr); break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } Index: linux-pi-futex.mm.q/kernel/rtmutex-debug.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex-debug.c +++ linux-pi-futex.mm.q/kernel/rtmutex-debug.c @@ -45,7 +45,8 @@ do { \ console_verbose(); \ if (spin_is_locked(¤t->pi_lock)) \ spin_unlock(¤t->pi_lock); \ - spin_unlock(&tracelock); \ + if (spin_is_locked(¤t->held_list_lock)) \ + spin_unlock(¤t->held_list_lock); \ } \ } while (0) @@ -83,14 +84,6 @@ do { \ # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) #endif - -/* - * We need a global lock when we walk through the multi-process - * lock tree... - */ -static spinlock_t tracelock = SPIN_LOCK_UNLOCKED; -static LIST_HEAD(held_locks); - /* * deadlock detection flag. We turn it off when we detect * the first problem because we dont want to recurse back @@ -183,7 +176,7 @@ static void show_task_locks(task_t *p) printk(" (not blocked)\n"); } -void rt_mutex_show_held_locks(task_t *filter) +void rt_mutex_show_held_locks(task_t *task, int verbose) { struct list_head *curr, *cursor = NULL; struct rt_mutex *lock; @@ -194,41 +187,32 @@ void rt_mutex_show_held_locks(task_t *fi if (!rt_trace_on) return; - if (filter) { + if (verbose) { printk("------------------------------\n"); printk("| showing all locks held by: | ("); - printk_task_short(filter); + printk_task_short(task); printk("):\n"); printk("------------------------------\n"); - } else { - printk("---------------------------\n"); - printk("| showing all locks held: |\n"); - printk("---------------------------\n"); } - /* - * Play safe and acquire the global trace lock. We - * cannot printk with that lock held so we iterate - * very carefully: - */ next: - spin_lock_irqsave(&tracelock, flags); - list_for_each(curr, &held_locks) { + spin_lock_irqsave(&task->held_list_lock, flags); + list_for_each(curr, &task->held_list_head) { if (cursor && curr != cursor) continue; - lock = list_entry(curr, struct rt_mutex, held_list); + lock = list_entry(curr, struct rt_mutex, held_list_entry); t = rt_mutex_owner(lock); - if (filter && (t != filter)) - continue; + WARN_ON(t != task); count++; cursor = curr->next; - spin_unlock_irqrestore(&tracelock, flags); + spin_unlock_irqrestore(&task->held_list_lock, flags); printk("\n#%03d: ", count); - printk_lock(lock, filter ? 0 : 1); + printk_lock(lock, 0); goto next; } - spin_unlock_irqrestore(&tracelock, flags); + spin_unlock_irqrestore(&task->held_list_lock, flags); + printk("\n"); } @@ -238,7 +222,10 @@ void rt_mutex_show_all_locks(void) int count = 10; int unlock = 1; - printk("\nshowing all tasks:\n"); + printk("\n"); + printk("----------------------\n"); + printk("| showing all tasks: |\n"); + printk("----------------------\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -270,7 +257,19 @@ retry: } while_each_thread(g, p); printk("\n"); - rt_mutex_show_held_locks(NULL); + + printk("-----------------------------------------\n"); + printk("| showing all locks held in the system: |\n"); + printk("-----------------------------------------\n"); + + do_each_thread(g, p) { + rt_mutex_show_held_locks(p, 0); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("=============================================\n\n"); if (unlock) @@ -279,10 +278,9 @@ retry: void rt_mutex_debug_check_no_locks_held(task_t *task) { - struct list_head *curr, *next, *cursor = NULL; - struct rt_mutex *lock; struct rt_mutex_waiter *w; - unsigned long flags; + struct list_head *curr; + struct rt_mutex *lock; if (!rt_trace_on) return; @@ -291,25 +289,9 @@ void rt_mutex_debug_check_no_locks_held( printk_task(task); printk("\n"); } -restart: - spin_lock_irqsave(&tracelock, flags); - list_for_each_safe(curr, next, &held_locks) { - if (cursor && curr != cursor) - continue; - lock = list_entry(curr, struct rt_mutex, held_list); - if(task != rt_mutex_owner(lock)) - continue; - cursor = next; - list_del_init(curr); - spin_unlock_irqrestore(&tracelock, flags); + if (list_empty(&task->held_list_head)) + return; - printk("BUG: %s/%d, lock held at task exit time!\n", - task->comm, task->pid); - printk_lock(lock, 1); - if (rt_mutex_owner(lock) != task) - printk("exiting task is not even the owner??\n"); - goto restart; - } spin_lock(&task->pi_lock); plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { TRACE_OFF(); @@ -320,33 +302,36 @@ restart: return; } spin_unlock(&task->pi_lock); - spin_unlock_irqrestore(&tracelock, flags); + + list_for_each(curr, &task->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + + printk("BUG: %s/%d, lock held at task exit time!\n", + task->comm, task->pid); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != task) + printk("exiting task is not even the owner??\n"); + } } int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) { - struct list_head *curr, *next, *cursor = NULL; const void *to = from + len; + struct list_head *curr; struct rt_mutex *lock; unsigned long flags; void *lock_addr; - int err = 0; if (!rt_trace_on) - return err; -restart: - spin_lock_irqsave(&tracelock, flags); - list_for_each_safe(curr, next, &held_locks) { - if (cursor && curr != cursor) - continue; - lock = list_entry(curr, struct rt_mutex, held_list); + return 0; + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_for_each(curr, ¤t->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); lock_addr = lock; if (lock_addr < from || lock_addr >= to) continue; - cursor = next; - list_del_init(curr); TRACE_OFF(); - err = 1; printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", current->comm, current->pid, lock, from, to); @@ -354,19 +339,17 @@ restart: printk_lock(lock, 1); if (rt_mutex_owner(lock) != current) printk("freeing task is not even the owner??\n"); - goto restart; + return 1; } - spin_unlock_irqrestore(&tracelock, flags); + spin_unlock_irqrestore(¤t->held_list_lock, flags); - return err; + return 0; } -void rt_mutex_debug_task_free(struct task_struct *tsk) +void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&tsk->pi_waiters)); - WARN_ON(!list_empty(&tsk->pi_lock_chain)); - WARN_ON(tsk->pi_blocked_on); - WARN_ON(tsk->pi_locked_by); + WARN_ON(!plist_head_empty(&task->pi_waiters)); + WARN_ON(task->pi_blocked_on); } /* @@ -383,9 +366,10 @@ void debug_rt_mutex_deadlock(int detect, return; task = rt_mutex_owner(act_waiter->lock); - - act_waiter->deadlock_task_pid = task->pid; - act_waiter->deadlock_lock = lock; + if (task && task != current) { + act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_lock = lock; + } } void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) @@ -417,8 +401,8 @@ void debug_rt_mutex_print_deadlock(struc printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); printk_lock(waiter->deadlock_lock, 1); - rt_mutex_show_held_locks(current); - rt_mutex_show_held_locks(task); + rt_mutex_show_held_locks(current, 1); + rt_mutex_show_held_locks(task, 1); printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); show_stack(task, NULL); @@ -436,11 +420,11 @@ void debug_rt_mutex_lock(struct rt_mutex unsigned long flags; if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list)); + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); - spin_lock_irqsave(&tracelock, flags); - list_add_tail(&lock->held_list, &held_locks); - spin_unlock_irqrestore(&tracelock, flags); + spin_lock_irqsave(¤t->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, ¤t->held_list_head); + spin_unlock_irqrestore(¤t->held_list_lock, flags); lock->acquire_ip = ip; } @@ -452,11 +436,27 @@ void debug_rt_mutex_unlock(struct rt_mut if (rt_trace_on) { TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list)); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - spin_lock_irqsave(&tracelock, flags); - list_del_init(&lock->held_list); - spin_unlock_irqrestore(&tracelock, flags); + spin_lock_irqsave(¤t->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(¤t->held_list_lock, flags); + } +} + +void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&powner->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, &powner->held_list_head); + spin_unlock_irqrestore(&powner->held_list_lock, flags); + + lock->acquire_ip = ip; } } @@ -465,12 +465,14 @@ void debug_rt_mutex_proxy_unlock(struct unsigned long flags; if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list)); + struct task_struct *owner = rt_mutex_owner(lock); + + TRACE_WARN_ON_LOCKED(!owner); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - spin_lock_irqsave(&tracelock, flags); - list_del_init(&lock->held_list); - spin_unlock_irqrestore(&tracelock, flags); + spin_lock_irqsave(&owner->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(&owner->held_list_lock, flags); } } @@ -496,7 +498,7 @@ void debug_rt_mutex_init(struct rt_mutex if (rt_trace_on) { rt_mutex_debug_check_no_locks_freed(addr, sizeof(struct rt_mutex)); - INIT_LIST_HEAD(&lock->held_list); + INIT_LIST_HEAD(&lock->held_list_entry); lock->name = name; } } Index: linux-pi-futex.mm.q/kernel/rtmutex-debug.h =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex-debug.h +++ linux-pi-futex.mm.q/kernel/rtmutex-debug.h @@ -21,10 +21,17 @@ extern void debug_rt_mutex_free_waiter(s extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_detect_deadlock(d) (1) # define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} Index: linux-pi-futex.mm.q/kernel/rtmutex.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex.c +++ linux-pi-futex.mm.q/kernel/rtmutex.c @@ -1,11 +1,12 @@ /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * - * started by Ingo Molnar and Thomas Gleixner: + * started by Ingo Molnar and Thomas Gleixner. * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen */ #include <linux/spinlock.h> #include <linux/module.h> @@ -133,211 +134,148 @@ static void __rt_mutex_adjust_prio(struc */ static void rt_mutex_adjust_prio(struct task_struct *task) { - unsigned long flags; - - spin_lock_irqsave(&task->pi_lock, flags); + spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); } /* - * PI-locking: we lock PI-dependencies opportunistically via trylock. - * - * In the overwhelming majority of cases the 'PI chain' is empty or at - * most 1-2 entries long, for which the trylock is sufficient, - * scalability-wise. The locking might look a bit scary, for which we - * apologize in advance :-) - * - * If any of the trylocks fails then we back out, task the global - * pi_conflicts_lock and take the locks again. This ensures deadlock-free - * but still scalable locking in the dependency graph, combined with - * the ability to reliably (and cheaply) detect user-space deadlocks. + * Max number of times we'll walk the boosting chain: */ -static DEFINE_SPINLOCK(pi_conflicts_lock); +int max_lock_depth = 1024; /* - * Lock the full boosting chain. - * - * If 'try' is set, we have to backout if we hit a owner who is - * running its own pi chain operation. We go back and take the slow - * path via the pi_conflicts_lock. - * - * We put all held locks into a list, via ->pi_lock_chain, and walk - * this list at unlock_pi_chain() time. - */ -static int lock_pi_chain(struct rt_mutex *act_lock, - struct rt_mutex_waiter *waiter, - struct list_head *lock_chain, - int try, int detect_deadlock) -{ - struct task_struct *owner; - struct rt_mutex *nextlock, *lock = act_lock; - struct rt_mutex_waiter *nextwaiter; - int deadlock_detect; + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); /* - * Debugging might turn deadlock detection on, unconditionally: + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. */ - deadlock_detect = debug_rt_mutex_detect_deadlock(detect_deadlock); - - for (;;) { - owner = rt_mutex_owner(lock); - - /* Check for circular dependencies */ - if (unlikely(owner->pi_locked_by == current)) { - debug_rt_mutex_deadlock(detect_deadlock, waiter, lock); - return detect_deadlock ? -EDEADLK : 1; - } - - while (!spin_trylock(&owner->pi_lock)) { - /* - * Owner runs its own chain. Go back and take - * the slow path - */ - if (try && owner->pi_locked_by == owner) - return -EBUSY; - cpu_relax(); - } - - BUG_ON(owner->pi_locked_by); - owner->pi_locked_by = current; - BUG_ON(!list_empty(&owner->pi_lock_chain)); - list_add(&owner->pi_lock_chain, lock_chain); + again: + if (++depth > max_lock_depth) { + static int prev_max; /* - * When the owner is blocked on a lock, try to take - * the lock: + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. */ - nextwaiter = owner->pi_blocked_on; - - /* End of chain? */ - if (!nextwaiter) - return 1; - - nextlock = nextwaiter->lock; - - /* Check for circular dependencies: */ - if (unlikely(nextlock == act_lock || - rt_mutex_owner(nextlock) == current)) { - debug_rt_mutex_deadlock(detect_deadlock, waiter, - nextlock); - list_del_init(&owner->pi_lock_chain); - owner->pi_locked_by = NULL; - spin_unlock(&owner->pi_lock); - return detect_deadlock ? -EDEADLK : 1; + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + task->comm, task->pid); } + put_task_struct(task); - /* Try to get nextlock->wait_lock: */ - if (unlikely(!spin_trylock(&nextlock->wait_lock))) { - list_del_init(&owner->pi_lock_chain); - owner->pi_locked_by = NULL; - spin_unlock(&owner->pi_lock); - cpu_relax(); - continue; - } - - lock = nextlock; + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock(&task->pi_lock); - /* - * If deadlock detection is done (or has to be done, as - * for userspace locks), we have to walk the full chain - * unconditionally. - */ - if (deadlock_detect) - continue; + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; - /* - * Optimization: we only have to continue up to the point - * where boosting/unboosting still has to be done: - */ + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; - /* Boost or unboost? */ - if (waiter) { - /* If the top waiter has higher priority, stop: */ - if (rt_mutex_top_waiter(lock)->list_entry.prio <= - waiter->list_entry.prio) - return 1; - } else { - /* If nextwaiter is not the top waiter, stop: */ - if (rt_mutex_top_waiter(lock) != nextwaiter) - return 1; - } + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) { + BUG_ON(waiter->pi_list_entry.prio != waiter->list_entry.prio); + goto out_unlock_pi; } -} - -/* - * Unlock the pi_chain: - */ -static void unlock_pi_chain(struct list_head *lock_chain) -{ - struct task_struct *owner, *tmp; - list_for_each_entry_safe(owner, tmp, lock_chain, pi_lock_chain) { - struct rt_mutex_waiter *waiter = owner->pi_blocked_on; - - list_del_init(&owner->pi_lock_chain); - BUG_ON(!owner->pi_locked_by); - owner->pi_locked_by = NULL; - if (waiter) - spin_unlock(&waiter->lock->wait_lock); - spin_unlock(&owner->pi_lock); + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock(&task->pi_lock); + cpu_relax(); + goto retry; } -} -/* - * Do the priority (un)boosting along the chain: - */ -static void adjust_pi_chain(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct rt_mutex_waiter *top_waiter, - struct list_head *lock_chain) -{ - struct task_struct *owner = rt_mutex_owner(lock); - struct list_head *curr = lock_chain->prev; + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == current) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } - for (;;) { - if (top_waiter) - plist_del(&top_waiter->pi_list_entry, - &owner->pi_waiters); + top_waiter = rt_mutex_top_waiter(lock); - if (waiter) - waiter->pi_list_entry.prio = waiter->task->prio; + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); - if (rt_mutex_has_waiters(lock)) { - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&top_waiter->pi_list_entry, - &owner->pi_waiters); - } + /* Release the task */ + spin_unlock(&task->pi_lock); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock(&task->pi_lock); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } - __rt_mutex_adjust_prio(owner); + get_task_struct(task); + spin_unlock(&task->pi_lock); - waiter = owner->pi_blocked_on; - if (!waiter || curr->prev == lock_chain) - return; - - curr = curr->prev; - lock = waiter->lock; - owner = rt_mutex_owner(lock); - top_waiter = rt_mutex_top_waiter(lock); + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = waiter->task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; - /* - * We can stop here, if the waiter is/was not the top - * priority waiter: - */ - if (top_waiter != waiter && - waiter != rt_mutex_top_waiter(lock)) - return; + goto again; - /* - * Note: waiter is not necessarily the new top - * waiter! - */ - waiter = rt_mutex_top_waiter(lock); - } + out_unlock_pi: + spin_unlock(&task->pi_lock); + out_put_task: + put_task_struct(task); + return ret; } /* @@ -448,111 +386,65 @@ static int try_to_take_rt_mutex(struct r /* * Task blocks on lock. * - * Prepare waiter and potentially propagate our priority into the pi chain. + * Prepare waiter and propagate pi chain * * This must be called with lock->wait_lock held. - * return values: 1: waiter queued, 0: got the lock, - * -EDEADLK: deadlock detected. */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock __IP_DECL__) + int detect_deadlock + __IP_DECL__) { struct rt_mutex_waiter *top_waiter = waiter; - LIST_HEAD(lock_chain); - int res = 1; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + spin_lock(¤t->pi_lock); + __rt_mutex_adjust_prio(current); waiter->task = current; waiter->lock = lock; - debug_rt_mutex_reset_waiter(waiter); - - spin_lock(¤t->pi_lock); - current->pi_locked_by = current; plist_node_init(&waiter->list_entry, current->prio); plist_node_init(&waiter->pi_list_entry, current->prio); - /* Get the top priority waiter of the lock: */ + /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); current->pi_blocked_on = waiter; - /* - * Call adjust_prio_chain, when waiter is the new top waiter - * or when deadlock detection is requested: - */ - if (waiter != rt_mutex_top_waiter(lock) && - !debug_rt_mutex_detect_deadlock(detect_deadlock)) - goto out_unlock_pi; - - /* Try to lock the full chain: */ - res = lock_pi_chain(lock, waiter, &lock_chain, 1, detect_deadlock); - - if (likely(res == 1)) - adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); - - /* Common case: we managed to lock it: */ - if (res != -EBUSY) - goto out_unlock_chain_pi; - - /* Rare case: we hit some other task running a pi chain operation: */ - unlock_pi_chain(&lock_chain); - - plist_del(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = NULL; - current->pi_locked_by = NULL; spin_unlock(¤t->pi_lock); - fixup_rt_mutex_waiters(lock); - - spin_unlock(&lock->wait_lock); - /* - * Here we have dropped all locks, and take the global - * pi_conflicts_lock. We have to redo all the work, no - * previous information about the lock is valid anymore: - */ - spin_lock(&pi_conflicts_lock); + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock(&owner->pi_lock); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); - spin_lock(&lock->wait_lock); - if (try_to_take_rt_mutex(lock __IP__)) { - /* - * Rare race: against all odds we got the lock. - */ - res = 0; - goto out; + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock(&owner->pi_lock); } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock(&owner->pi_lock); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock(&owner->pi_lock); + } + if (!boost) + return 0; - WARN_ON(!rt_mutex_owner(lock) || rt_mutex_owner(lock) == current); - - spin_lock(¤t->pi_lock); - current->pi_locked_by = current; - - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); - - /* Get the top priority waiter of the lock: */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - current->pi_blocked_on = waiter; - - /* Lock the full chain: */ - res = lock_pi_chain(lock, waiter, &lock_chain, 0, detect_deadlock); + spin_unlock(&lock->wait_lock); - /* Drop the conflicts lock before adjusting: */ - spin_unlock(&pi_conflicts_lock); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + waiter __IP__); - if (likely(res == 1)) - adjust_pi_chain(lock, waiter, top_waiter, &lock_chain); + spin_lock(&lock->wait_lock); - out_unlock_chain_pi: - unlock_pi_chain(&lock_chain); - out_unlock_pi: - current->pi_locked_by = NULL; - spin_unlock(¤t->pi_lock); - out: return res; } @@ -569,6 +461,8 @@ static void wakeup_next_waiter(struct rt struct rt_mutex_waiter *waiter; struct task_struct *pendowner; + spin_lock(¤t->pi_lock); + waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -578,15 +472,14 @@ static void wakeup_next_waiter(struct rt * boosted mode and go back to normal after releasing * lock->wait_lock. */ - spin_lock(¤t->pi_lock); plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - spin_unlock(¤t->pi_lock); - pendowner = waiter->task; waiter->task = NULL; rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + spin_unlock(¤t->pi_lock); + /* * Clear the pi_blocked_on variable and enqueue a possible * waiter into the pi_waiters list of the pending owner. This @@ -616,87 +509,52 @@ static void wakeup_next_waiter(struct rt /* * Remove a waiter from a lock * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held */ -static int remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter __IP_DECL__) +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) { - struct rt_mutex_waiter *next_waiter = NULL, - *top_waiter = rt_mutex_top_waiter(lock); - LIST_HEAD(lock_chain); - int res; - - plist_del(&waiter->list_entry, &lock->wait_list); + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); spin_lock(¤t->pi_lock); - - if (waiter != top_waiter || rt_mutex_owner(lock) == current) - goto out; - - current->pi_locked_by = current; - - if (rt_mutex_has_waiters(lock)) - next_waiter = rt_mutex_top_waiter(lock); - - /* Try to lock the full chain: */ - res = lock_pi_chain(lock, next_waiter, &lock_chain, 1, 0); - - if (likely(res != -EBUSY)) { - adjust_pi_chain(lock, next_waiter, waiter, &lock_chain); - goto out_unlock; - } - - /* We hit some other task running a pi chain operation: */ - unlock_pi_chain(&lock_chain); - plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; - current->pi_locked_by = NULL; + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; spin_unlock(¤t->pi_lock); - spin_unlock(&lock->wait_lock); - - spin_lock(&pi_conflicts_lock); - - spin_lock(&lock->wait_lock); - - spin_lock(¤t->pi_lock); - current->pi_locked_by = current; - /* We might have been woken up: */ - if (!waiter->task) { - spin_unlock(&pi_conflicts_lock); - goto out; - } + if (first && owner != current) { - top_waiter = rt_mutex_top_waiter(lock); + spin_lock(&owner->pi_lock); - plist_del(&waiter->list_entry, &lock->wait_list); + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); - if (waiter != top_waiter || rt_mutex_owner(lock) == current) - goto out; + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; - /* Get the top priority waiter of the lock: */ - if (rt_mutex_has_waiters(lock)) - next_waiter = rt_mutex_top_waiter(lock); + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); - /* Lock the full chain: */ - lock_pi_chain(lock, next_waiter, &lock_chain, 0, 0); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock(&owner->pi_lock); + } - /* Drop the conflicts lock: */ - spin_unlock(&pi_conflicts_lock); + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - adjust_pi_chain(lock, next_waiter, waiter, &lock_chain); + if (!boost) + return; - out_unlock: - unlock_pi_chain(&lock_chain); - out: - current->pi_blocked_on = NULL; - waiter->task = NULL; - current->pi_locked_by = NULL; - spin_unlock(¤t->pi_lock); + spin_unlock(&lock->wait_lock); - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__); - return 0; + spin_lock(&lock->wait_lock); } /* @@ -709,21 +567,18 @@ rt_mutex_slowlock(struct rt_mutex *lock, { struct rt_mutex_waiter waiter; int ret = 0; - unsigned long flags; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock_irqsave(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock __IP__)) { - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); return 0; } - BUG_ON(rt_mutex_owner(lock) == current); - set_current_state(state); /* Setup the timer, when timeout != NULL */ @@ -753,25 +608,30 @@ rt_mutex_slowlock(struct rt_mutex *lock, /* * waiter.task is NULL the first time we come here and * when we have been woken up by the previous owner - * but the lock got stolen by an higher prio task. + * but the lock got stolen by a higher prio task. */ if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, detect_deadlock __IP__); - /* got the lock or deadlock: */ - if (ret == 0 || ret == -EDEADLK) + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) break; - ret = 0; } - - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(&waiter); if (waiter.task) schedule_rt_mutex(lock); - spin_lock_irqsave(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); set_current_state(state); } @@ -786,10 +646,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); /* Remove pending timer: */ - if (unlikely(timeout && timeout->task)) + if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); /* @@ -811,10 +671,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, static inline int rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) { - unsigned long flags; int ret = 0; - spin_lock_irqsave(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -826,7 +685,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo fixup_rt_mutex_waiters(lock); } - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); return ret; } @@ -837,9 +696,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - unsigned long flags; - - spin_lock_irqsave(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); debug_rt_mutex_unlock(lock); @@ -847,13 +704,13 @@ rt_mutex_slowunlock(struct rt_mutex *loc if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); return; } wakeup_next_waiter(lock); - spin_unlock_irqrestore(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -869,8 +726,8 @@ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); @@ -883,8 +740,8 @@ static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); @@ -974,7 +831,6 @@ rt_mutex_timed_lock(struct rt_mutex *loc } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); - /** * rt_mutex_trylock - try to lock a rt_mutex * @@ -1030,7 +886,7 @@ void __rt_mutex_init(struct rt_mutex *lo { lock->owner = NULL; spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); + plist_head_init(&lock->wait_list, &lock->wait_lock); debug_rt_mutex_init(lock, name); } @@ -1050,7 +906,7 @@ void rt_mutex_init_proxy_locked(struct r struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL); - debug_rt_mutex_lock(lock __RET_IP__); + debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); rt_mutex_set_owner(lock, proxy_owner, 0); rt_mutex_deadlock_account_lock(lock, proxy_owner); } Index: linux-pi-futex.mm.q/kernel/rtmutex.h =================================================================== --- linux-pi-futex.mm.q.orig/kernel/rtmutex.h +++ linux-pi-futex.mm.q/kernel/rtmutex.h @@ -19,10 +19,11 @@ #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) #define debug_rt_mutex_init(m, n) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_detect_deadlock(d) (d) +#define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) Index: linux-pi-futex.mm.q/kernel/sched.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/sched.c +++ linux-pi-futex.mm.q/kernel/sched.c @@ -1522,6 +1522,12 @@ void fastcall sched_fork(task_t *p, int * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #ifdef CONFIG_SCHEDSTATS @@ -3893,7 +3899,6 @@ static void __setscheduler(struct task_s BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); @@ -3902,7 +3907,6 @@ static void __setscheduler(struct task_s */ if (policy == SCHED_BATCH) p->sleep_avg = 0; - set_load_weight(p); } Index: linux-pi-futex.mm.q/kernel/sysctl.c =================================================================== --- linux-pi-futex.mm.q.orig/kernel/sysctl.c +++ linux-pi-futex.mm.q/kernel/sysctl.c @@ -132,6 +132,10 @@ extern int acct_parm[]; extern int no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -684,6 +688,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; Index: linux-pi-futex.mm.q/scripts/rt-tester/t3-l1-pi-signal.tst =================================================================== --- linux-pi-futex.mm.q.orig/scripts/rt-tester/t3-l1-pi-signal.tst +++ linux-pi-futex.mm.q/scripts/rt-tester/t3-l1-pi-signal.tst @@ -69,15 +69,18 @@ W: locked: 0: 0 C: locknowait: 1: 0 W: blocked: 1: 0 T: prioeq: 0: 80 +T: prioeq: 1: 80 # T2 lock L0 interruptible, no wait in the wakeup path C: lockintnowait: 2: 0 W: blocked: 2: 0 T: prioeq: 0: 81 +T: prioeq: 1: 80 # Interrupt T2 C: signal: 2: 2 W: unlocked: 2: 0 +T: prioeq: 1: 80 T: prioeq: 0: 80 T: locked: 0: 0 ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch] PI-futex patchset: -V4 2006-03-31 19:14 ` [patch] PI-futex patchset: -V4 Ingo Molnar @ 2006-03-31 19:23 ` Ingo Molnar 2006-04-01 0:19 ` Peter Williams 1 sibling, 0 replies; 12+ messages in thread From: Ingo Molnar @ 2006-03-31 19:23 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel, tglx, torvalds, Esben Nielsen * Ingo Molnar <mingo@elte.hu> wrote: > this is version -V4 of the PI-futex patchset (ontop of current -mm2, > which includes -V3.) > > A clean queue of split-up patches can be found at: > > http://redhat.com/~mingo/PI-futex-patches/PI-futex-patches-V4.tar.gz oops, the plist debugging changes were missing from the delta. Find incremental patch ontop of -V4 below. The tarball above contains all the needed patches. Ingo ----- PI-futex -V4 part #2 Signed-off-by: Ingo Molnar <mingo@elte.hu> -- include/asm-generic/bug.h | 6 ++++++ include/linux/plist.h | 29 +++++++++++++++++++++++++---- lib/plist.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 4 deletions(-) Index: linux-pi-futex.mm.q/include/asm-generic/bug.h =================================================================== --- linux-pi-futex.mm.q.orig/include/asm-generic/bug.h +++ linux-pi-futex.mm.q/include/asm-generic/bug.h @@ -39,4 +39,10 @@ #endif #endif +#ifdef CONFIG_SMP +# define WARN_ON_SMP(x) WARN_ON(x) +#else +# define WARN_ON_SMP(x) do { } while (0) +#endif + #endif Index: linux-pi-futex.mm.q/include/linux/plist.h =================================================================== --- linux-pi-futex.mm.q.orig/include/linux/plist.h +++ linux-pi-futex.mm.q/include/linux/plist.h @@ -79,6 +79,9 @@ struct plist_head { struct list_head prio_list; struct list_head node_list; +#ifdef CONFIG_DEBUG_PI_LIST + spinlock_t *lock; +#endif }; struct plist_node { @@ -86,15 +89,22 @@ struct plist_node { struct plist_head plist; }; +#ifdef CONFIG_DEBUG_PI_LIST +# define PLIST_HEAD_LOCK_INIT(_lock) .lock = _lock +#else +# define PLIST_HEAD_LOCK_INIT(_lock) +#endif + /** * #PLIST_HEAD_INIT - static struct plist_head initializer * * @head: struct plist_head variable name */ -#define PLIST_HEAD_INIT(head) \ +#define PLIST_HEAD_INIT(head, _lock) \ { \ .prio_list = LIST_HEAD_INIT((head).prio_list), \ .node_list = LIST_HEAD_INIT((head).node_list), \ + PLIST_HEAD_LOCK_INIT(&(_lock)) \ } /** @@ -115,10 +125,13 @@ struct plist_node { * @head: &struct plist_head pointer */ static inline void -plist_head_init(struct plist_head *head) +plist_head_init(struct plist_head *head, spinlock_t *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); +#ifdef CONFIG_DEBUG_PI_LIST + head->lock = lock; +#endif } /** @@ -130,7 +143,7 @@ plist_head_init(struct plist_head *head) static inline void plist_node_init(struct plist_node *node, int prio) { node->prio = prio; - plist_head_init(&node->plist); + plist_head_init(&node->plist, NULL); } extern void plist_add(struct plist_node *node, struct plist_head *head); @@ -207,8 +220,16 @@ static inline int plist_node_empty(const * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. */ -#define plist_first_entry(head, type, member) \ +#ifdef CONFIG_DEBUG_PI_LIST +# define plist_first_entry(head, type, member) \ +({ \ + WARN_ON(plist_head_empty(head)); \ + container_of(plist_first(head), type, member); \ +}) +#else +# define plist_first_entry(head, type, member) \ container_of(plist_first(head), type, member) +#endif /** * plist_first - return the first node (and thus, highest priority) Index: linux-pi-futex.mm.q/lib/plist.c =================================================================== --- linux-pi-futex.mm.q.orig/lib/plist.c +++ linux-pi-futex.mm.q/lib/plist.c @@ -26,6 +26,44 @@ #include <linux/plist.h> #include <linux/spinlock.h> +#ifdef CONFIG_DEBUG_PI_LIST + +static void plist_check_prev_next(struct list_head *t, struct list_head *p, + struct list_head *n) +{ + if (n->prev != p || p->next != n) { + printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev); + printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev); + printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev); + WARN_ON(1); + } +} + +static void plist_check_list(struct list_head *top) +{ + struct list_head *prev = top, *next = top->next; + + plist_check_prev_next(top, prev, next); + while (next != top) { + prev = next; + next = prev->next; + plist_check_prev_next(top, prev, next); + } +} + +static void plist_check_head(struct plist_head *head) +{ + WARN_ON(!head->lock); + if (head->lock) + WARN_ON_SMP(!spin_is_locked(head->lock)); + plist_check_list(&head->prio_list); + plist_check_list(&head->node_list); +} + +#else +# define plist_check_head(h) do { } while (0) +#endif + /** * plist_add - add @node to @head * @@ -36,6 +74,7 @@ void plist_add(struct plist_node *node, { struct plist_node *iter; + plist_check_head(head); WARN_ON(!plist_node_empty(node)); list_for_each_entry(iter, &head->prio_list, plist.prio_list) { @@ -52,6 +91,8 @@ lt_prio: list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); eq_prio: list_add_tail(&node->plist.node_list, &iter->plist.node_list); + + plist_check_head(head); } /** @@ -62,6 +103,8 @@ eq_prio: */ void plist_del(struct plist_node *node, struct plist_head *head) { + plist_check_head(head); + if (!list_empty(&node->plist.prio_list)) { struct plist_node *next = plist_first(&node->plist); @@ -70,4 +113,6 @@ void plist_del(struct plist_node *node, } list_del_init(&node->plist.node_list); + + plist_check_head(head); } ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [patch] PI-futex patchset: -V4 2006-03-31 19:14 ` [patch] PI-futex patchset: -V4 Ingo Molnar 2006-03-31 19:23 ` Ingo Molnar @ 2006-04-01 0:19 ` Peter Williams 1 sibling, 0 replies; 12+ messages in thread From: Peter Williams @ 2006-04-01 0:19 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, linux-kernel, tglx, torvalds, Esben Nielsen Ingo Molnar wrote: > this is version -V4 of the PI-futex patchset (ontop of current -mm2, > which includes -V3.) > > A clean queue of split-up patches can be found at: > > http://redhat.com/~mingo/PI-futex-patches/PI-futex-patches-V4.tar.gz > > the -V4 codebase has been tested on the glibc code (all testcases pass) > and under load as well. (The -V4 code is included in the 2.6.16-rt12 > code as well that i released earlier today.) > > Changes since -V3: > > - added Esben Nielsen's PI locking code, Thomas Gleixner made it > work in cornercases and under load. This is significantly simpler (it > removes 50 lines of code from rtmutex.c). The main difference is that > instead of holding all locks along a dependency chain, this code > propagates PI priorities (and detects deadlocks) by holding at most > two locks at once, and by being preemptible between such steps. > > - Jakub Jelinek did a detailed review of the new futex code and found > some new races, which Thomas Gleixner fixed. > > - to fix a pthread_mutex_trylock() related race, FUTEX_TRYLOCK_PI has > been added (Thomas Gleixner) > > - documentation fixes based on feedback from Tim Bird > > - added Documentation/pi-futex.txt (in addition to rt-mutex.txt) > > - added the plist debugging patch (which was part of -rt but wasnt part > of the pi-futex queue before). This caught a couple of SMP bugs in > the past. > > - implemented more scalable held-locks debugging - it's now a per-task > list of held locks, instead of a global list. This is similarly > effective to the global list, but much more scalable. (This approach > will also be added to the stock kernel/mutex.c code.) > > - do not fiddle with irq flags in rtmutex.c - it's not needed. > > - clone/fork fix: do not let parent's potential PI priority 'leak' into > child threads or processes. > > - added /proc/sys/kernel/max_lock_depth with a default limit of 1024, > to limit the amount of deadlock-checking the kernel will do. > > - small enhancement to the t3-l1-pi-signal.tst testcase. Wouldn't this be a good opportunity to redefine SCHED_BATCH as 4 instead of 3 so that you can use ((p->policy & (SCHED_FIFO|SCHED_RR)) == 0) instead of (p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH)? That expression will be called fairly frequently and SCHED_BATCH hasn't been around long enough for a change of value to break very much use space code. Peter -- Peter Williams pwil3058@bigpond.net.au "Learning, n. The kind of ignorance distinguishing the studious." -- Ambrose Bierce ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2006-04-01 0:19 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2006-03-25 18:46 [patch 05/10] PI-futex: rt-mutex core Ingo Molnar 2006-03-26 5:35 ` Andrew Morton 2006-03-26 23:33 ` Ingo Molnar 2006-03-26 6:07 ` Andrew Morton 2006-03-26 16:03 ` [patch] PI-futex: -V2 Ingo Molnar 2006-03-26 19:16 ` Andrew Morton 2006-03-26 23:30 ` Ingo Molnar 2006-03-26 23:16 ` [patch] PI-futex: -V3 Ingo Molnar 2006-03-28 10:17 ` [patch-mm2] PI-futex: fix timeout race Thomas Gleixner 2006-03-31 19:14 ` [patch] PI-futex patchset: -V4 Ingo Molnar 2006-03-31 19:23 ` Ingo Molnar 2006-04-01 0:19 ` Peter Williams
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.