* [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
@ 2007-01-03 12:32 Pierre Peiffer
2007-01-03 12:35 ` Ingo Molnar
0 siblings, 1 reply; 6+ messages in thread
From: Pierre Peiffer @ 2007-01-03 12:32 UTC (permalink / raw)
To: LKML
Cc: Dinakar Guniguntala, Jean-Pierre Dion, Ingo Molnar,
Sébastien Dugué, Ulrich Drepper, Darren Hart
Hi,
First, thanks Ingo for your comments on my previous mail from december.
I've taken all your remarks into account.
The 64-bit and compat versions have been implemented and tested.
The glibc part has also been updated and the x86_64 version is now implemented too.
Here after is the updated patch for kernel 2.6.19-rt15.
Comments, suggestions, feedbacks are welcome, as usual.
---
With the introduction of the PI-futex (used for the PI-pthread_mutex in
the glibc), the current futex_requeue optimization (*) can not be used
any more if the pthread_mutex used with the pthread-condvar is a PI-mutex.
(*) this optimization is used in pthread_cond_broadcast.
To use this optimization with PI-mutex, we need a function that
re-queues some threads from a normal futex to a PI-futex (see why in
this discussion:
http://marc.theaimsgroup.com/?l=linux-kernel&m=115020355126851&w=2 )
Here is a proposal of an implementation of futex_requeue_pi.
To use or test this patch with pthreads, you need to patch the
glibc with the patch available here:
http://www.bullopensource.org/posix/pi-futex/glibc_requeue_pi.diff
(Only available for ix86 and x86_64 architecture)
or use the generic pthread_cond_broadcast.c file
(patch here:
http://www.bullopensource.org/posix/pi-futex/glibc_pth_broadcast_c_file.diff
)
---
include/linux/futex.h | 8
kernel/futex.c | 1076 ++++++++++++++++++++++++++++++++++++++++++------
kernel/futex_compat.c | 3
kernel/rtmutex.c | 43 -
kernel/rtmutex_common.h | 36 +
5 files changed, 1013 insertions(+), 153 deletions(-)
---
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
---
Index: linux-2.6/include/linux/futex.h
===================================================================
--- linux-2.6.orig/include/linux/futex.h 2006-12-18 09:24:07.000000000 +0100
+++ linux-2.6/include/linux/futex.h 2006-12-18 09:24:50.000000000 +0100
@@ -15,6 +15,7 @@
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_CMP_REQUEUE_PI 9
/*
* Support for robust futexes: the kernel cleans up held futexes at
@@ -83,9 +84,14 @@ struct robust_list_head {
#define FUTEX_OWNER_DIED 0x40000000
/*
+ * Some processes have been requeued on this PI-futex
+ */
+#define FUTEX_WAITER_REQUEUED 0x20000000
+
+/*
* The rest of the robust-futex field is for the TID:
*/
-#define FUTEX_TID_MASK 0x3fffffff
+#define FUTEX_TID_MASK 0x0fffffff
/*
* This limit protects against a deliberately circular list.
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c 2006-12-18 09:24:08.000000000 +0100
+++ linux-2.6/kernel/futex.c 2006-12-18 09:24:50.000000000 +0100
@@ -53,6 +53,12 @@
#include "rtmutex_common.h"
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -128,6 +134,12 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /*
+ * This waiter is used in case of requeue from a
+ * normal futex to a PI-futex
+ */
+ struct rt_mutex_waiter waiter;
};
/*
@@ -249,6 +261,25 @@ static int get_futex_key(void *uaddr, un
}
/*
+ * Retrieve the original address used to compute this key
+ */
+static void *get_futex_address(union futex_key *key)
+{
+ void *uaddr;
+
+ if (key->both.offset & 1) {
+ /* shared mapping */
+ uaddr = (void*)((key->shared.pgoff << PAGE_SHIFT)
+ + key->shared.offset - 1);
+ } else {
+ /* private mapping */
+ uaddr = (void*)(key->private.address + key->private.offset);
+ }
+
+ return uaddr;
+}
+
+/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
@@ -463,7 +494,8 @@ void exit_pi_state_list(struct task_stru
}
static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
@@ -474,7 +506,7 @@ lookup_pi_state(u32 uval, struct futex_h
head = &hb->chain;
plist_for_each_entry_safe(this, next, head, list) {
- if (match_futex(&this->key, &me->key)) {
+ if (match_futex(&this->key, key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
@@ -489,7 +521,7 @@ lookup_pi_state(u32 uval, struct futex_h
WARN_ON(!atomic_read(&pi_state->refcount));
atomic_inc(&pi_state->refcount);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -516,7 +548,7 @@ lookup_pi_state(u32 uval, struct futex_h
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
- pi_state->key = me->key;
+ pi_state->key = *key;
spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
@@ -526,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_h
put_task_struct(p);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -563,11 +595,12 @@ static int wake_futex_pi(u32 __user *uad
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 curval, newval;
+ unsigned long flags;
if (!pi_state)
return -EINVAL;
- spin_lock(&pi_state->pi_mutex.wait_lock);
+ spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -586,28 +619,40 @@ static int wake_futex_pi(u32 __user *uad
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
dec_preempt_count();
- if (curval == -EFAULT)
+ if (curval == -EFAULT) {
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock,
+ flags);
return -EFAULT;
- if (curval != uval)
+ }
+ if (curval != uval) {
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock,
+ flags);
return -EINVAL;
+ }
+
+ if (uval & FUTEX_WAITER_REQUEUED)
+ rt_mutex_deadlock_account_lock(&pi_state->pi_mutex,
+ new_owner);
}
- spin_lock_irq(&pi_state->owner->pi_lock);
+ spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- spin_unlock_irq(&pi_state->owner->pi_lock);
+ spin_unlock(&pi_state->owner->pi_lock);
- spin_lock_irq(&new_owner->pi_lock);
+ spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- spin_unlock_irq(&new_owner->pi_lock);
+ spin_unlock(&new_owner->pi_lock);
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
- spin_unlock(&pi_state->pi_mutex.wait_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
@@ -650,6 +695,253 @@ double_lock_hb(struct futex_hash_bucket
}
/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+
+{
+ u32 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+ unsigned long flags = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ down_read(¤t->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ up_read(¤t->mm->mmap_sem);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue(uaddr2, hb2,
+ &key2, &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock_irqsave(&lock2->wait_lock, flags);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+ }
+ this->key = key2;
+ get_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock(&owner->pi_lock);
+ spin_unlock_irqrestore(&lock2->wait_lock, flags);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock_irqrestore(&lock2->wait_lock, flags);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_key_refs(&key1);
+
+out:
+ up_read(¤t->mm->mmap_sem);
+ return ret;
+}
+
+/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
@@ -990,9 +1282,10 @@ static int unqueue_me(struct futex_q *q)
/*
* PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
*/
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
{
WARN_ON(plist_node_empty(&q->list));
plist_del(&q->list, &q->list.plist);
@@ -1001,11 +1294,65 @@ static void unqueue_me_pi(struct futex_q
free_pi_state(q->pi_state);
q->pi_state = NULL;
- spin_unlock(&hb->lock);
+ spin_unlock(q->lock_ptr);
drop_key_refs(&q->key);
}
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u32 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ up_read(&curr->mm->mmap_sem);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
static int futex_wait(u32 __user *uaddr, u32 val, struct timespec *time)
{
struct task_struct *curr = current;
@@ -1014,7 +1361,7 @@ static int futex_wait(u32 __user *uaddr,
struct futex_q q;
u32 uval;
int ret;
- struct hrtimer_sleeper t;
+ struct hrtimer_sleeper t, *to = NULL;
int rem = 0;
q.pi_state = NULL;
@@ -1068,6 +1415,14 @@ static int futex_wait(u32 __user *uaddr,
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -1101,7 +1456,7 @@ static int futex_wait(u32 __user *uaddr,
if (time->tv_sec == 0 && time->tv_nsec == 0)
schedule();
else {
-
+ to = &t;
hrtimer_init(&t.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
hrtimer_init_sleeper(&t, current);
@@ -1133,6 +1488,67 @@ static int futex_wait(u32 __user *uaddr,
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter, flags);
+ }
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ down_read(&curr->mm->mmap_sem);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = get_futex_address(&q.pi_state->key);
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ up_read(&curr->mm->mmap_sem);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
@@ -1152,6 +1568,52 @@ static int futex_wait(u32 __user *uaddr,
return ret;
}
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+ union futex_key *key, struct task_struct *p)
+{
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex *lock;
+ unsigned long flags;
+
+ /* Search a waiter that should already exists */
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, key)) {
+ pi_state = this->pi_state;
+ break;
+ }
+ }
+
+ BUG_ON(!pi_state);
+
+ /* set p as pi_state's owner */
+ atomic_inc(&pi_state->refcount);
+
+ lock = &pi_state->pi_mutex;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+ spin_lock(&p->pi_lock);
+
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+
+ debug_rt_mutex_proxy_lock(lock, p);
+ WARN_ON(rt_mutex_owner(lock));
+ rt_mutex_set_owner(lock, p, 0);
+ rt_mutex_deadlock_account_lock(lock, p);
+
+ plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+ &p->pi_waiters);
+ __rt_mutex_adjust_prio(p);
+
+ spin_unlock(&p->pi_lock);
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+}
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
@@ -1166,7 +1628,7 @@ static int futex_lock_pi(u32 __user *uad
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1189,6 +1651,8 @@ static int futex_lock_pi(u32 __user *uad
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1207,7 +1671,16 @@ static int futex_lock_pi(u32 __user *uad
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1219,7 +1692,18 @@ static int futex_lock_pi(u32 __user *uad
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1230,11 +1714,16 @@ static int futex_lock_pi(u32 __user *uad
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -1297,45 +1786,10 @@ static int futex_lock_pi(u32 __user *uad
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(¤t->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, ¤t->pi_state_list);
- spin_unlock_irq(¤t->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -1346,7 +1800,7 @@ static int futex_lock_pi(u32 __user *uad
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
+ unqueue_me_pi(&q);
up_read(&curr->mm->mmap_sem);
}
@@ -1715,6 +2169,8 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+ mval |= (uval & FUTEX_WAITER_REQUEUED);
nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
if (nval == -EFAULT)
@@ -1845,6 +2301,9 @@ long do_futex(u32 __user *uaddr, int op,
case FUTEX_TRYLOCK_PI:
ret = futex_lock_pi(uaddr, 0, timeout, 1);
break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+ break;
default:
ret = -ENOSYS;
}
@@ -1864,16 +2323,71 @@ static inline int get_futex_value_locked
return ret ? -EFAULT : 0;
}
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner64(u64 __user *uaddr, struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u64 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u64 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ up_read(&curr->mm->mmap_sem);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic64(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
static int wake_futex_pi64(u64 __user *uaddr, u64 uval, struct futex_q *this)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u64 curval, newval;
+ unsigned long flags;
if (!pi_state)
return -EINVAL;
- spin_lock(&pi_state->pi_mutex.wait_lock);
+ spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -1892,28 +2406,40 @@ static int wake_futex_pi64(u64 __user *u
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic64(uaddr, uval, newval);
dec_preempt_count();
- if (curval == -EFAULT)
+ if (curval == -EFAULT) {
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock,
+ flags);
return -EFAULT;
- if (curval != uval)
+ }
+ if (curval != uval) {
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock,
+ flags);
return -EINVAL;
+ }
+
+ if (uval & FUTEX_WAITER_REQUEUED)
+ rt_mutex_deadlock_account_lock(&pi_state->pi_mutex,
+ new_owner);
}
- spin_lock_irq(&pi_state->owner->pi_lock);
+ spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- spin_unlock_irq(&pi_state->owner->pi_lock);
+ spin_unlock(&pi_state->owner->pi_lock);
- spin_lock_irq(&new_owner->pi_lock);
+ spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- spin_unlock_irq(&new_owner->pi_lock);
+ spin_unlock(&new_owner->pi_lock);
- spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
@@ -2183,6 +2709,254 @@ out:
return ret;
}
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue64(u64 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+
+{
+ u64 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked64(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic64(uaddr, uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi64(u64 __user *uaddr1, u64 __user *uaddr2,
+ int nr_wake, int nr_requeue, u64 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+ unsigned long flags = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ down_read(¤t->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u64 curval;
+
+ ret = get_futex_value_locked64(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ up_read(¤t->mm->mmap_sem);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue64(uaddr2, hb2,
+ &key2,
+ &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock_irqsave(&lock2->wait_lock, flags);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+ }
+ this->key = key2;
+ get_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock(&owner->pi_lock);
+ spin_unlock_irqrestore(&lock2->wait_lock, flags);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock_irqrestore(&lock2->wait_lock, flags);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_key_refs(&key1);
+
+out:
+ up_read(¤t->mm->mmap_sem);
+ return ret;
+}
+
static int futex_wait64(u64 __user *uaddr, u64 val, struct timespec *time)
{
struct task_struct *curr = current;
@@ -2191,7 +2965,7 @@ static int futex_wait64(u64 __user *uadd
struct futex_q q;
u64 uval;
int ret;
- struct hrtimer_sleeper t;
+ struct hrtimer_sleeper t, *to = NULL;
int rem = 0;
q.pi_state = NULL;
@@ -2245,6 +3019,14 @@ static int futex_wait64(u64 __user *uadd
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -2278,7 +3060,7 @@ static int futex_wait64(u64 __user *uadd
if (time->tv_sec == 0 && time->tv_nsec == 0)
schedule();
else {
-
+ to = &t;
hrtimer_init(&t.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
hrtimer_init_sleeper(&t, current);
@@ -2310,6 +3092,67 @@ static int futex_wait64(u64 __user *uadd
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock->wait_lock, flags);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter, flags);
+ }
+ spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ down_read(&curr->mm->mmap_sem);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = get_futex_address(&q.pi_state->key);
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner64(uaddr, &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ up_read(&curr->mm->mmap_sem);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
@@ -2338,7 +3181,7 @@ static int futex_lock_pi64(u64 __user *u
struct futex_hash_bucket *hb;
u64 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -2361,6 +3204,8 @@ static int futex_lock_pi64(u64 __user *u
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -2379,7 +3224,16 @@ static int futex_lock_pi64(u64 __user *u
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -2391,7 +3245,18 @@ static int futex_lock_pi64(u64 __user *u
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic64(uaddr, uval, newval);
@@ -2402,11 +3267,16 @@ static int futex_lock_pi64(u64 __user *u
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -2469,45 +3339,10 @@ static int futex_lock_pi64(u64 __user *u
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u64 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(¤t->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, ¤t->pi_state_list);
- spin_unlock_irq(¤t->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic64(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner64(uaddr, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -2518,7 +3353,7 @@ static int futex_lock_pi64(u64 __user *u
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
+ unqueue_me_pi(&q);
up_read(&curr->mm->mmap_sem);
}
@@ -2699,6 +3534,9 @@ long do_futex64(u64 __user *uaddr, int o
case FUTEX_TRYLOCK_PI:
ret = futex_lock_pi64(uaddr, 0, timeout, 1);
break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi64(uaddr, uaddr2, val, val2, &val3);
+ break;
default:
ret = -ENOSYS;
}
@@ -2721,7 +3559,8 @@ sys_futex64(u64 __user *uaddr, int op, u
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (unsigned long) utime;
return do_futex64(uaddr, op, val, &t, uaddr2, val2, val3);
@@ -2745,7 +3584,8 @@ asmlinkage long sys_futex(u32 __user *ua
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
Index: linux-2.6/kernel/rtmutex.c
===================================================================
--- linux-2.6.orig/kernel/rtmutex.c 2006-12-18 09:24:09.000000000 +0100
+++ linux-2.6/kernel/rtmutex.c 2006-12-18 09:24:50.000000000 +0100
@@ -56,7 +56,7 @@
* state.
*/
-static void
+void
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
unsigned long mask)
{
@@ -80,29 +80,6 @@ static void fixup_rt_mutex_waiters(struc
clear_rt_mutex_waiters(lock);
}
-/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
-
- do {
- owner = *p;
- cpu_relax();
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- lock->owner = (struct task_struct *)
- ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
int pi_initialized;
@@ -140,7 +117,7 @@ int rt_mutex_getprio(struct task_struct
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
@@ -176,11 +153,11 @@ int max_lock_depth = 1024;
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
- struct rt_mutex *orig_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -541,9 +518,9 @@ static void wakeup_next_waiter(struct rt
*
* Must be called with lock->wait_lock held
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- unsigned long flags)
+void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ unsigned long flags)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
Index: linux-2.6/kernel/rtmutex_common.h
===================================================================
--- linux-2.6.orig/kernel/rtmutex_common.h 2006-12-18 09:14:20.000000000 +0100
+++ linux-2.6/kernel/rtmutex_common.h 2006-12-18 09:24:50.000000000 +0100
@@ -113,6 +113,30 @@ static inline unsigned long rt_mutex_own
}
/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ cpu_relax();
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +144,16 @@ extern void rt_mutex_init_proxy_locked(s
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ unsigned long flags);
#endif
Index: linux-2.6/kernel/futex_compat.c
===================================================================
--- linux-2.6.orig/kernel/futex_compat.c 2006-12-18 09:24:08.000000000 +0100
+++ linux-2.6/kernel/futex_compat.c 2006-12-18 09:24:50.000000000 +0100
@@ -150,7 +150,8 @@ asmlinkage long compat_sys_futex(u32 __u
if (!timespec_valid(&t))
return -EINVAL;
}
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, &t, uaddr2, val2, val3);
Thanks,
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
2007-01-03 12:32 [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2) Pierre Peiffer
@ 2007-01-03 12:35 ` Ingo Molnar
2007-01-03 14:35 ` Pierre Peiffer
0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2007-01-03 12:35 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Sébastien Dugué, Ulrich Drepper, Darren Hart
* Pierre Peiffer <pierre.peiffer@bull.net> wrote:
> Hi,
>
> First, thanks Ingo for your comments on my previous mail from
> december. I've taken all your remarks into account.
>
> The 64-bit and compat versions have been implemented and tested. The
> glibc part has also been updated and the x86_64 version is now
> implemented too.
>
> Here after is the updated patch for kernel 2.6.19-rt15.
looks good to me in principle. The size of the patch is scary - is there
really no simpler way? Also, could you send me a patch against a
20-rc3-rt0-ish kernel so that i can stick this into -rt for testing?
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
2007-01-03 12:35 ` Ingo Molnar
@ 2007-01-03 14:35 ` Pierre Peiffer
2007-01-03 15:56 ` Ingo Molnar
0 siblings, 1 reply; 6+ messages in thread
From: Pierre Peiffer @ 2007-01-03 14:35 UTC (permalink / raw)
To: Ingo Molnar
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Sébastien Dugué, Ulrich Drepper, Darren Hart
Ingo Molnar a écrit :
>
> looks good to me in principle. The size of the patch is scary - is there
> really no simpler way?
Humf, in fact, for the 64-bit part, I've followed the rule of the existing
64-bit code in futex.c, which consists of duplicating all the functions which
can not be kept common, and add a suffix 64 to all duplicated functions.
Perhaps I missed something ?
> Also, could you send me a patch against a
> 20-rc3-rt0-ish kernel so that i can stick this into -rt for testing?
>
Ok, will do that.
Thanks,
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
2007-01-03 14:35 ` Pierre Peiffer
@ 2007-01-03 15:56 ` Ingo Molnar
2007-01-04 7:59 ` Pierre Peiffer
0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2007-01-03 15:56 UTC (permalink / raw)
To: Pierre Peiffer
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Sébastien Dugué, Ulrich Drepper, Darren Hart
* Pierre Peiffer <pierre.peiffer@bull.net> wrote:
> Ingo Molnar a écrit :
> >
> >looks good to me in principle. The size of the patch is scary - is there
> >really no simpler way?
>
> Humf, in fact, for the 64-bit part, I've followed the rule of the
> existing 64-bit code in futex.c, which consists of duplicating all the
> functions which can not be kept common, and add a suffix 64 to all
> duplicated functions. Perhaps I missed something ?
i dont think you missed anything - but some consolidation here would be
nice. Only if possible of course :-)
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
2007-01-03 15:56 ` Ingo Molnar
@ 2007-01-04 7:59 ` Pierre Peiffer
2007-01-04 8:08 ` Ulrich Drepper
0 siblings, 1 reply; 6+ messages in thread
From: Pierre Peiffer @ 2007-01-04 7:59 UTC (permalink / raw)
To: Ingo Molnar
Cc: LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Sébastien Dugué, Ulrich Drepper, Darren Hart
Ingo Molnar a écrit :
> * Pierre Peiffer <pierre.peiffer@bull.net> wrote:
>
>> Ingo Molnar a écrit :
>>> looks good to me in principle. The size of the patch is scary - is there
>>> really no simpler way?
>> Humf, in fact, for the 64-bit part, I've followed the rule of the
>> existing 64-bit code in futex.c, which consists of duplicating all the
>> functions which can not be kept common, and add a suffix 64 to all
>> duplicated functions. Perhaps I missed something ?
>
> i dont think you missed anything - but some consolidation here would be
> nice. Only if possible of course :-)
Ok ;-)
So, you are not only speaking about "my" part of duplicated code, right ?
But, just for information, what is the sys_futex64 for, exactly ? Is there a
plan to have in the future a 64-bit PID ? Because for now, 32-bits futex is
enough, so... ?
Otherwise, I don't have a "clean" way to avoid this duplication.... :-/
--
Pierre Peiffer
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2)
2007-01-04 7:59 ` Pierre Peiffer
@ 2007-01-04 8:08 ` Ulrich Drepper
0 siblings, 0 replies; 6+ messages in thread
From: Ulrich Drepper @ 2007-01-04 8:08 UTC (permalink / raw)
To: Pierre Peiffer
Cc: Ingo Molnar, LKML, Dinakar Guniguntala, Jean-Pierre Dion,
Sébastien Dugué, Darren Hart
[-- Attachment #1: Type: text/plain, Size: 519 bytes --]
Pierre Peiffer wrote:
> But, just for information, what is the sys_futex64 for, exactly ? Is
> there a plan to have in the future a 64-bit PID ?
This has nothing to do with PIDs. We need 64-bit values for more
complex bit fields which can then be stored in the futex. One example
is a new, much faster rwlock implementation. These are not practical
without 64-bit futexes. This is why I asked Ingo to add the code.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2007-01-04 8:12 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-01-03 12:32 [PATCH 2.6.19.1-rt15][RFC] - futex_requeue_pi implementation (requeue from futex1 to PI-futex2) Pierre Peiffer
2007-01-03 12:35 ` Ingo Molnar
2007-01-03 14:35 ` Pierre Peiffer
2007-01-03 15:56 ` Ingo Molnar
2007-01-04 7:59 ` Pierre Peiffer
2007-01-04 8:08 ` Ulrich Drepper
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox