Re: Futex queue_me/get_user ordering

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Jakub Jelinek <jakub@redhat.com>
To: Jamie Lokier <jamie@shareable.org>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>,
	mingo@elte.hu, Andrew Morton <akpm@osdl.org>,
	linux-kernel@vger.kernel.org, rusty@rustcorp.com.au, ahu@ds9a.nl,
	Ulrich Drepper <drepper@redhat.com>,
	Roland McGrath <roland@redhat.com>
Subject: Re: Futex queue_me/get_user ordering
Date: Thu, 17 Mar 2005 05:26:22 -0500	[thread overview]
Message-ID: <20050317102619.GA23494@devserv.devel.redhat.com> (raw)
In-Reply-To: <20041118194726.GX10340@devserv.devel.redhat.com>

On Thu, Nov 18, 2004 at 02:47:26PM -0500, Jakub Jelinek wrote:
> The scenario described in futex_wait-fix.patch IMHO can happen even
> if all calls to pthread_cond_signal are done with mutex held around it, i.e.
> A		B		X		Y
> pthread_mutex_lock (&mtx);
> pthread_cond_wait (&cv, &mtx);
>   - mtx release *)
>   total++ [1/0/0] (0) {}
> 				pthread_mutex_lock (&mtx);
> 				pthread_cond_signal (&cv);
> 				  - wake++ [1/1/0] (1) {}
> 				  FUTEX_WAKE, 1 (returns, nothing is queued)
> 				pthread_mutex_unlock (&mtx);
> 		pthread_mutex_lock (&mtx);
> 		pthread_cond_wait (&cv, &mtx);
> 		  - mtx release *)
> 		  total++ [2/1/0] (1) {}
>   FUTEX_WAIT, 0
>   queue_me [2/1/0] (1) {A}
>   0 != 1
> 		FUTEX_WAIT, 1
> 		queue_me [2/1/0] (1) {A,B}
> 		1 == 1
> 						pthread_mutex_lock (&mtx);
> 						pthread_cond_signal (&cv);
> 						  - wake++ [2/2/0] (2) {A,B}
> 						  FUTEX_WAKE, 1 (unqueues incorrectly A)
> 						  [2/2/0] (2) {B}
> 						pthread_mutex_unlock (&mtx);
>   try to dequeue but already dequeued
>   would normally return EWOULDBLOCK here
>   but as unqueue_me failed, returns 0
>   woken++ [2/2/1] (2) {B}
> 		schedule_timeout (forever)
>   - mtx reacquire
>   pthread_cond_wait returns
> pthread_mutex_unlock (&mtx);
> 
> 		-------------------
> 		the code would like to say pthread_mutex_unlock (&mtx);
> 		and pthread_exit here, but never reaches there.
...

http://www.ussg.iu.edu/hypermail/linux/kernel/0411.2/0953.html

Your argument in November was that you don't want to slow down the
kernel and that userland must be able to cope with the
non-atomicity of futex syscall.

But with the recent changes to futex.c I think kernel can ensure
atomicity for free.

With get_futex_value_locked doing the user access in_atomic () and
repeating if that failed, I think it would be just a matter of
something as in the patch below (totally untested though).
It would simplify requeue implementation (getting rid of the nqueued
field), as well as never enqueue a futex in futex_wait until
the *uaddr == val uaccess check has shown it should be enqueued.
And I don't think the kernel will be any slower because of that,
in the common case where get_futex_value_locked does not cause
a mm fault (userland typically accessed that memory a few cycles before
the syscall), the futex_wait change is just about doing first half of
queue_me before the user access and second half after it.

--- linux-2.6.11/kernel/futex.c.jj	2005-03-17 04:42:29.000000000 -0500
+++ linux-2.6.11/kernel/futex.c	2005-03-17 05:13:45.000000000 -0500
@@ -97,7 +97,6 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
-       unsigned int	    nqueued;
        struct list_head       chain;
 };
 
@@ -265,7 +264,6 @@ static inline int get_futex_value_locked
 	inc_preempt_count();
 	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
 	dec_preempt_count();
-	preempt_check_resched();
 
 	return ret ? -EFAULT : 0;
 }
@@ -339,7 +337,6 @@ static int futex_requeue(unsigned long u
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
-	unsigned int nqueued;
 
  retry:
 	down_read(&current->mm->mmap_sem);
@@ -354,23 +351,24 @@ static int futex_requeue(unsigned long u
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
-	nqueued = bh1->nqueued;
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
+
 	if (likely(valp != NULL)) {
 		int curval;
 
-		/* In order to avoid doing get_user while
-		   holding bh1->lock and bh2->lock, nqueued
-		   (monotonically increasing field) must be first
-		   read, then *uaddr1 fetched from userland and
-		   after acquiring lock nqueued field compared with
-		   the stored value.  The smp_mb () below
-		   makes sure that bh1->nqueued is read from memory
-		   before *uaddr1.  */
-		smp_mb();
-
 		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
 
 		if (unlikely(ret)) {
+			spin_unlock(&bh1->lock);
+			if (bh1 != bh2)
+				spin_unlock(&bh2->lock);
+
+			preempt_check_resched();
+
 			/* If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
@@ -385,21 +383,10 @@ static int futex_requeue(unsigned long u
 		}
 		if (curval != *valp) {
 			ret = -EAGAIN;
-			goto out;
+			goto out_unlock;
 		}
 	}
 
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
-
-	if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
-
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -435,13 +422,9 @@ out:
 	return ret;
 }
 
-/*
- * queue_me and unqueue_me must be called as a pair, each
- * exactly once.  They are called with the hashed spinlock held.
- */
-
 /* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *
+queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
 	struct futex_hash_bucket *bh;
 
@@ -455,11 +438,35 @@ static void queue_me(struct futex_q *q, 
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
-	bh->nqueued++;
+	return bh;
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+{
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
 
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+	spin_unlock(&bh->lock);
+	drop_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once.  They are called with the hashed spinlock held.
+ */
+
+/* The key must be already stored in q->key. */
+static void queue_me(struct futex_q *q, int fd, struct file *filp)
+{
+	struct futex_hash_bucket *bh;
+	bh = queue_lock(q, fd, filp);
+	__queue_me(q, bh);
+}
+
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
@@ -503,6 +510,7 @@ static int futex_wait(unsigned long uadd
 	DECLARE_WAITQUEUE(wait, current);
 	int ret, curval;
 	struct futex_q q;
+	struct futex_hash_bucket *bh;
 
  retry:
 	down_read(&current->mm->mmap_sem);
@@ -511,7 +519,7 @@ static int futex_wait(unsigned long uadd
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	queue_me(&q, -1, NULL);
+	bh = queue_lock(&q, -1, NULL);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -537,14 +545,15 @@ static int futex_wait(unsigned long uadd
 	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
 
 	if (unlikely(ret)) {
+		queue_unlock(&q, bh);
+
+		preempt_check_resched();
+
 		/* If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
 		up_read(&current->mm->mmap_sem);
 
-		if (!unqueue_me(&q)) /* There's a chance we got woken already */
-			return 0;
-
 		ret = get_user(curval, (int __user *)uaddr);
 
 		if (!ret)
@@ -553,9 +562,15 @@ static int futex_wait(unsigned long uadd
 	}
 	if (curval != val) {
 		ret = -EWOULDBLOCK;
-		goto out_unqueue;
+		queue_unlock(&q, bh);
+		preempt_check_resched();
+		goto out_release_sem;
 	}
 
+	/* Only actually queue if *uaddr contained val.  */
+	__queue_me(&q, bh);
+	preempt_check_resched();
+
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.


	Jakub

next prev parent reply	other threads:[~2005-03-17 11:30 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20041113164048.2f31a8dd.akpm@osdl.org>
2004-11-14  9:00 ` Futex queue_me/get_user ordering (was: 2.6.10-rc1-mm5 [u]) Emergency Services Jamie Lokier
2004-11-14  9:09   ` Andrew Morton
2004-11-14  9:23     ` Jamie Lokier
2004-11-14  9:50       ` bert hubert
2004-11-15 14:12         ` Jamie Lokier
2004-11-16  8:30           ` Futex queue_me/get_user ordering Hidetoshi Seto
2004-11-16 14:58             ` Jamie Lokier
2004-11-18  1:29               ` Hidetoshi Seto
2004-11-15  0:58       ` Hidetoshi Seto
2004-11-15  2:01         ` Jamie Lokier
2004-11-15  3:06           ` Hidetoshi Seto
2004-11-15 13:22             ` Jamie Lokier
2004-11-17  8:47               ` Jakub Jelinek
2004-11-18  2:10                 ` Hidetoshi Seto
2004-11-18  7:20                 ` Jamie Lokier
2004-11-18 19:47                   ` Jakub Jelinek
2005-03-17 10:26                     ` Jakub Jelinek [this message]
2005-03-17 15:20                       ` Jamie Lokier
2005-03-17 15:55                         ` Jakub Jelinek
2005-03-18 17:00                           ` Ingo Molnar
2005-03-21  2:55                             ` Jamie Lokier
2005-03-18 16:53                         ` Jakub Jelinek
2004-11-26 17:06                 ` Jamie Lokier
2004-11-28 17:36                   ` Joe Seigh
2004-11-29 11:24                   ` Jakub Jelinek
2004-11-29 21:50                     ` Jamie Lokier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050317102619.GA23494@devserv.devel.redhat.com \
    --to=jakub@redhat.com \
    --cc=ahu@ds9a.nl \
    --cc=akpm@osdl.org \
    --cc=drepper@redhat.com \
    --cc=jamie@shareable.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=roland@redhat.com \
    --cc=rusty@rustcorp.com.au \
    --cc=seto.hidetoshi@jp.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).