All of lore.kernel.org
 help / color / mirror / Atom feed
From: Darren Hart <dvhltc@us.ibm.com>
To: Darren Hart <dvhltc@us.ibm.com>
Cc: linux-kernel@vger.kernel.org,
	Thomas Gleixner <tglx@linutronix.de>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@elte.hu>,
	Eric Dumazet <eric.dumazet@gmail.com>,
	John Kacur <jkacur@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Mike Galbraith <efault@gmx.de>,
	linux-rt-users@vger.kernel.org
Subject: Re: [PATCH 4/4 V2] futex: convert hash_bucket locks to raw_spinlock_t
Date: Fri, 09 Jul 2010 15:57:09 -0700	[thread overview]
Message-ID: <4C37A945.2000606@us.ibm.com> (raw)
In-Reply-To: <1278714780-788-5-git-send-email-dvhltc@us.ibm.com>

This version pulls in the bits mistakenly left in 3/4.


>From 9f8b4faac79518f98131464c2d21a1c64fb841d2 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 9 Jul 2010 16:44:47 -0400
Subject: [PATCH 4/4 V2] futex: convert hash_bucket locks to raw_spinlock_t

The requeue_pi mechanism introduced proxy locking of the rtmutex.  This creates
a scenario where a task can wake-up, not knowing it has been enqueued on an
rtmutex. In order to detect this, the task would have to be able to take either
task->pi_blocked_on->lock->wait_lock and/or the hb->lock.  Unfortunately,
without already holding one of these, the pi_blocked_on variable can change
from NULL to valid or from valid to NULL. Therefor, the task cannot be allowed
to take a sleeping lock after wakeup or it could end up trying to block on two
locks, the second overwriting a valid pi_blocked_on value. This obviously
breaks the pi mechanism.

This patch increases latency, while running the ltp pthread_cond_many test
which Michal reported the bug with, I see double digit hrtimer latencies
(typically only on the first run after boo):

	kernel: hrtimer: interrupt took 75911 ns

This might be addressed by changing the various loops in the futex code to be
incremental, probably at an additional throughput hit. The private hash_bucket
lists discussed in the past could reduce hb->lock contention in some scenarios.
It should be noted that pthread_cond_many is a rather pathological case.

This also introduces problems for plists which want a spinlock_t rather
than a raw_spinlock_t. Any thoughts on how to address this?

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <efault@gmx.de>
---
 kernel/futex.c |   73 ++++++++++++++++++++++++++++++--------------------------
 1 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2cd58a2..0ad5a85 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -114,7 +114,7 @@ struct futex_q {
 	struct plist_node list;
 
 	struct task_struct *task;
-	spinlock_t *lock_ptr;
+	raw_spinlock_t *lock_ptr;
 	union futex_key key;
 	struct futex_pi_state *pi_state;
 	struct rt_mutex_waiter *rt_waiter;
@@ -128,7 +128,7 @@ struct futex_q {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct plist_head chain;
 };
 
@@ -479,7 +479,7 @@ void exit_pi_state_list(struct task_struct *curr)
 		hb = hash_futex(&key);
 		raw_spin_unlock_irq(&curr->pi_lock);
 
-		spin_lock(&hb->lock);
+		raw_spin_lock(&hb->lock);
 
 		raw_spin_lock_irq(&curr->pi_lock);
 		/*
@@ -487,7 +487,7 @@ void exit_pi_state_list(struct task_struct *curr)
 		 * task still owns the PI-state:
 		 */
 		if (head->next != next) {
-			spin_unlock(&hb->lock);
+			raw_spin_unlock(&hb->lock);
 			continue;
 		}
 
@@ -499,7 +499,7 @@ void exit_pi_state_list(struct task_struct *curr)
 
 		rt_mutex_unlock(&pi_state->pi_mutex);
 
-		spin_unlock(&hb->lock);
+		raw_spin_unlock(&hb->lock);
 
 		raw_spin_lock_irq(&curr->pi_lock);
 	}
@@ -860,21 +860,21 @@ static inline void
 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
 	if (hb1 <= hb2) {
-		spin_lock(&hb1->lock);
+		raw_spin_lock(&hb1->lock);
 		if (hb1 < hb2)
-			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+			raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 	} else { /* hb1 > hb2 */
-		spin_lock(&hb2->lock);
-		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+		raw_spin_lock(&hb2->lock);
+		raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 	}
 }
 
 static inline void
 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
-	spin_unlock(&hb1->lock);
+	raw_spin_unlock(&hb1->lock);
 	if (hb1 != hb2)
-		spin_unlock(&hb2->lock);
+		raw_spin_unlock(&hb2->lock);
 }
 
 /*
@@ -896,7 +896,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 		goto out;
 
 	hb = hash_futex(&key);
-	spin_lock(&hb->lock);
+	raw_spin_lock(&hb->lock);
 	head = &hb->chain;
 
 	plist_for_each_entry_safe(this, next, head, list) {
@@ -916,7 +916,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 		}
 	}
 
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 	put_futex_key(fshared, &key);
 out:
 	return ret;
@@ -1070,6 +1070,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 
 	q->lock_ptr = &hb->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
+	/* FIXME: we're converting this to a raw lock... */
 	q->list.plist.spinlock = &hb->lock;
 #endif
 
@@ -1377,14 +1378,14 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	hb = hash_futex(&q->key);
 	q->lock_ptr = &hb->lock;
 
-	spin_lock(&hb->lock);
+	raw_spin_lock(&hb->lock);
 	return hb;
 }
 
 static inline void
 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 	drop_futex_key_refs(&q->key);
 }
 
@@ -1416,11 +1417,12 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 
 	plist_node_init(&q->list, prio);
 #ifdef CONFIG_DEBUG_PI_LIST
+	/* FIXME: we're converting this to a raw_spinlock */
 	q->list.plist.spinlock = &hb->lock;
 #endif
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 }
 
 /**
@@ -1436,7 +1438,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
  */
 static int unqueue_me(struct futex_q *q)
 {
-	spinlock_t *lock_ptr;
+	raw_spinlock_t *lock_ptr;
 	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
@@ -1444,7 +1446,7 @@ retry:
 	lock_ptr = q->lock_ptr;
 	barrier();
 	if (lock_ptr != NULL) {
-		spin_lock(lock_ptr);
+		raw_spin_lock(lock_ptr);
 		/*
 		 * q->lock_ptr can change between reading it and
 		 * spin_lock(), causing us to take the wrong lock.  This
@@ -1459,7 +1461,7 @@ retry:
 		 * we can detect whether we acquired the correct lock.
 		 */
 		if (unlikely(lock_ptr != q->lock_ptr)) {
-			spin_unlock(lock_ptr);
+			raw_spin_unlock(lock_ptr);
 			goto retry;
 		}
 		WARN_ON(plist_node_empty(&q->list));
@@ -1467,7 +1469,7 @@ retry:
 
 		BUG_ON(q->pi_state);
 
-		spin_unlock(lock_ptr);
+		raw_spin_unlock(lock_ptr);
 		ret = 1;
 	}
 
@@ -1491,7 +1493,7 @@ static void unqueue_me_pi(struct futex_q *q)
 	pi_state = q->pi_state;
 	q->pi_state = NULL;
 
-	spin_unlock(q->lock_ptr);
+	raw_spin_unlock(q->lock_ptr);
 	drop_futex_key_refs(&q->key);
 
 	free_pi_state(pi_state);
@@ -1579,11 +1581,11 @@ retry:
 	 * simply return.
 	 */
 handle_fault:
-	spin_unlock(q->lock_ptr);
+	raw_spin_unlock(q->lock_ptr);
 
 	ret = fault_in_user_writeable(uaddr);
 
-	spin_lock(q->lock_ptr);
+	raw_spin_lock(q->lock_ptr);
 
 	/*
 	 * Check if someone else fixed it for us:
@@ -1976,7 +1978,7 @@ retry_private:
 		ret = ret ? 0 : -EWOULDBLOCK;
 	}
 
-	spin_lock(q.lock_ptr);
+	raw_spin_lock(q.lock_ptr);
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
 	 * haven't already.
@@ -2053,7 +2055,7 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
-	spin_lock(&hb->lock);
+	raw_spin_lock(&hb->lock);
 
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
@@ -2102,14 +2104,14 @@ retry:
 	}
 
 out_unlock:
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 	put_futex_key(fshared, &key);
 
 out:
 	return ret;
 
 pi_faulted:
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 	put_futex_key(fshared, &key);
 
 	ret = fault_in_user_writeable(uaddr);
@@ -2257,9 +2259,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
 
-	spin_lock(&hb->lock);
+	raw_spin_lock(&hb->lock);
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
-	spin_unlock(&hb->lock);
+	raw_spin_unlock(&hb->lock);
 	if (ret)
 		goto out_put_keys;
 
@@ -2277,10 +2279,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 		 * did a lock-steal - fix up the PI-state in that case.
 		 */
 		if (q.pi_state && (q.pi_state->owner != current)) {
-			spin_lock(q.lock_ptr);
+			raw_spin_lock(q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current,
 						   fshared);
-			spin_unlock(q.lock_ptr);
+			raw_spin_unlock(q.lock_ptr);
 		}
 	} else {
 		/*
@@ -2293,7 +2295,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
 		debug_rt_mutex_free_waiter(&rt_waiter);
 
-		spin_lock(q.lock_ptr);
+		raw_spin_lock(q.lock_ptr);
 		/*
 		 * Fixup the pi_state owner and possibly acquire the lock if we
 		 * haven't already.
@@ -2668,8 +2670,11 @@ static int __init futex_init(void)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
-		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
-		spin_lock_init(&futex_queues[i].lock);
+		/* 
+		 * FIXME: plist wants a spinlock, but the hb->lock is a raw_spinlock_t
+		 */
+		plist_head_init(&futex_queues[i].chain, NULL /*&futex_queues[i].lock*/);
+		raw_spin_lock_init(&futex_queues[i].lock);
 	}
 
 	return 0;
-- 
1.7.0.4

  reply	other threads:[~2010-07-09 22:57 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-07-09 22:32 [PATCH 0/4][RT] futex: fix tasks blocking on two rt_mutex locks Darren Hart
2010-07-09 22:32 ` [PATCH 1/4] rtmutex: avoid null derefence in WARN_ON Darren Hart
2010-07-10  0:29   ` Steven Rostedt
2010-07-10 14:42     ` Darren Hart
2010-07-09 22:32 ` [PATCH 2/4] rtmutex: add BUG_ON if a task attempts to block on two locks Darren Hart
2010-07-10  0:30   ` Steven Rostedt
2010-07-10 17:30     ` [PATCH 2/4 V2] " Darren Hart
2010-07-09 22:32 ` [PATCH 3/4] futex: free_pi_state outside of hb->lock sections Darren Hart
2010-07-09 22:55   ` [PATCH 3/4 V2] " Darren Hart
2010-07-09 22:55     ` Darren Hart
2010-07-10  0:32     ` Steven Rostedt
2010-07-10 14:41       ` Darren Hart
2010-07-12 10:35   ` [PATCH 3/4] " Thomas Gleixner
2010-07-12 10:46     ` Steven Rostedt
2010-07-09 22:33 ` [PATCH 4/4] futex: convert hash_bucket locks to raw_spinlock_t Darren Hart
2010-07-09 22:57   ` Darren Hart [this message]
2010-07-10  0:34     ` [PATCH 4/4 V2] " Steven Rostedt
2010-07-10 19:41   ` [PATCH 4/4] " Mike Galbraith
2010-07-11 13:33     ` Mike Galbraith
2010-07-11 15:10       ` Darren Hart
2010-07-12 11:45       ` Steven Rostedt
2010-07-12 12:12         ` Mike Galbraith
2010-07-12 19:10     ` Darren Hart
2010-07-12 20:40       ` Thomas Gleixner
2010-07-12 20:43         ` Thomas Gleixner
2010-07-13  3:09         ` Mike Galbraith
2010-07-13  7:12           ` Darren Hart
2010-07-12 13:05   ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4C37A945.2000606@us.ibm.com \
    --to=dvhltc@us.ibm.com \
    --cc=efault@gmx.de \
    --cc=eric.dumazet@gmail.com \
    --cc=jkacur@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.