public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Add FUTEX_CMP_REQUEUE futex op
@ 2004-05-20  9:38 Jakub Jelinek
  2004-05-20 22:52 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-20  9:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo

Hi!

FUTEX_REQUEUE operation has been added to the kernel mainly to improve
pthread_cond_broadcast which previously used FUTEX_WAKE INT_MAX op.
pthread_cond_broadcast releases internal condvar mutex before FUTEX_REQUEUE
operation, as otherwise the woken up thread most likely immediately sleeps
again on the internal condvar mutex until the broadcasting thread releases
it.
Unfortunately this is racy and causes e.g.
http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/nptl/tst-cond16.c?rev=1.1&content-type=text/x-cvsweb-markup&cvsroot=glibc
to hang on SMP.
http://listman.redhat.com/archives/phil-list/2004-May/msg00023.html
contains analysis how the hang happens, the problem is if any thread
does pthread_cond_*wait in between releasing of the internal condvar
mutex and FUTEX_REQUEUE operation, a wrong thread might be awaken (and
immediately go to sleep again because it doesn't satisfy conditions for
returning from pthread_cond_*wait) while the right thread requeued on
the associated mutex and there would be nobody to wake that thread up.

The patch below extends FUTEX_REQUEUE operation with something FUTEX_WAIT
already uses:
FUTEX_CMP_REQUEUE is passed an additional argument which is the expected
value of *futex.  Kernel then while holding the futex locks checks if
*futex != expected and returns -EAGAIN in that case, while if it is equal,
continues with a normal FUTEX_REQUEUE operation.
If the syscall returns -EAGAIN, NPTL can fall back to FUTEX_WAKE INT_MAX
operation which doesn't have this problem, but is less efficient, while
in the likely case that nobody hit the (small) window the efficient
FUTEX_REQUEUE operation is used.

--- linux-2.6.5/include/linux/futex.h.jj	2004-04-04 05:37:36.000000000 +0200
+++ linux-2.6.5/include/linux/futex.h	2004-05-05 09:57:09.200306101 +0200
@@ -8,9 +8,10 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 #define FUTEX_REQUEUE (3)
-
+#define FUTEX_CMP_REQUEUE (4)
 
 long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2);
+		unsigned long timeout, unsigned long uaddr2, int val2,
+		int val3);
 
 #endif
--- linux-2.6.5/kernel/futex.c.jj	2004-04-04 05:36:52.000000000 +0200
+++ linux-2.6.5/kernel/futex.c	2004-05-05 12:23:33.481048623 +0200
@@ -96,6 +96,7 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
+       unsigned int	    nqueued;
        struct list_head       chain;
 };
 
@@ -318,13 +319,14 @@ out:
  * physical page.
  */
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-				int nr_wake, int nr_requeue)
+			 int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
+	unsigned int nqueued;
 
 	down_read(&current->mm->mmap_sem);
 
@@ -338,12 +340,33 @@ static int futex_requeue(unsigned long u
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
+	nqueued = bh1->nqueued;
+	if (likely (valp != NULL)) {
+		int curval;
+
+		smp_mb ();
+
+		if (get_user(curval, (int *)uaddr1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
 	if (bh1 < bh2)
 		spin_lock(&bh1->lock);
 	spin_lock(&bh2->lock);
 	if (bh1 > bh2)
 		spin_lock(&bh1->lock);
 
+	if (unlikely (nqueued != bh1->nqueued && valp != NULL)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -365,6 +388,7 @@ static int futex_requeue(unsigned long u
 		}
 	}
 
+out_unlock:
 	spin_unlock(&bh1->lock);
 	if (bh1 != bh2)
 		spin_unlock(&bh2->lock);
@@ -398,6 +422,7 @@ static void queue_me(struct futex_q *q, 
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
+	bh->nqueued++;
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
@@ -625,7 +650,7 @@ out:
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2)
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -641,7 +666,10 @@ long do_futex(unsigned long uaddr, int o
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2);
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -651,7 +679,8 @@ long do_futex(unsigned long uaddr, int o
 
 
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			  struct timespec __user *utime, u32 __user *uaddr2)
+			  struct timespec __user *utime, u32 __user *uaddr2,
+			  int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +694,11 @@ asmlinkage long sys_futex(u32 __user *ua
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 
 static struct super_block *
--- linux-2.6.5/kernel/compat.c.jj	2004-04-04 05:37:07.000000000 +0200
+++ linux-2.6.5/kernel/compat.c	2004-05-05 09:56:36.761119626 +0200
@@ -208,7 +208,7 @@ asmlinkage long compat_sys_sigprocmask(i
 
 #ifdef CONFIG_FUTEX
 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-		struct compat_timespec *utime, u32 *uaddr2)
+		struct compat_timespec *utime, u32 *uaddr2, int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -219,11 +219,11 @@ asmlinkage long compat_sys_futex(u32 *ua
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 #endif
 

	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-20  9:38 [PATCH] Add FUTEX_CMP_REQUEUE futex op Jakub Jelinek
@ 2004-05-20 22:52 ` Andrew Morton
  2004-05-21  6:06   ` Ulrich Drepper
  2004-05-21  7:05   ` Ingo Molnar
  2004-05-21 23:05 ` Andrew Morton
  2004-05-29  3:13 ` Rusty Russell
  2 siblings, 2 replies; 20+ messages in thread
From: Andrew Morton @ 2004-05-20 22:52 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, mingo

Jakub Jelinek <jakub@redhat.com> wrote:
>
>  asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
> -			  struct timespec __user *utime, u32 __user *uaddr2)
> +			  struct timespec __user *utime, u32 __user *uaddr2,
> +			  int val3)

Is it safe to go adding a new argument to an existing syscall in this manner?

It'll work OK on x86 because of the stack layout but is the same true of
all other supported architectures?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-20 22:52 ` Andrew Morton
@ 2004-05-21  6:06   ` Ulrich Drepper
  2004-05-21  6:36     ` Andrew Morton
  2004-05-21  7:05   ` Ingo Molnar
  1 sibling, 1 reply; 20+ messages in thread
From: Ulrich Drepper @ 2004-05-21  6:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jakub Jelinek, linux-kernel, mingo

Andrew Morton wrote:

> Is it safe to go adding a new argument to an existing syscall in this manner?

Yes.  This is a multiplexed syscall and the opcode decides which syscall
parameter is used.


> It'll work OK on x86 because of the stack layout but is the same true of
> all other supported architectures?

We add parameters at the end.  This does not influence how previous
values are passed.  And especially for syscalls it makes no difference.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21  6:06   ` Ulrich Drepper
@ 2004-05-21  6:36     ` Andrew Morton
  2004-05-21  7:15       ` Ulrich Drepper
  2004-05-21  7:43       ` Jakub Jelinek
  0 siblings, 2 replies; 20+ messages in thread
From: Andrew Morton @ 2004-05-21  6:36 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: jakub, linux-kernel, mingo

Ulrich Drepper <drepper@redhat.com> wrote:
>
> Andrew Morton wrote:
> 
> > Is it safe to go adding a new argument to an existing syscall in this manner?
> 
> Yes.  This is a multiplexed syscall and the opcode decides which syscall
> parameter is used.
> 

Of course.

> > It'll work OK on x86 because of the stack layout but is the same true of
> > all other supported architectures?
> 
> We add parameters at the end.  This does not influence how previous
> values are passed.  And especially for syscalls it makes no difference.
> 

what we're effectively doing is:

void foo(int a, int b, int c)
{
}

and from another compilation unit:

	foo(a, b);

and we're expecting the a's and b's to line up across all architectures and
compiler options.  I thought that on some architectures that only works out
if the function has a vararg declaration.

Does it do the right thing on stack-grows-up machines?

If the compiler passes the first few args via registers and the rest on the
stack, are we sure that it won't at some level of complexity decide to pass
_all_ the args on the stack?  It's free to do so, I think.

I have a vague memory of getting bitten by this trick once...

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-20 22:52 ` Andrew Morton
  2004-05-21  6:06   ` Ulrich Drepper
@ 2004-05-21  7:05   ` Ingo Molnar
  1 sibling, 0 replies; 20+ messages in thread
From: Ingo Molnar @ 2004-05-21  7:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jakub Jelinek, linux-kernel


On Thu, 20 May 2004, Andrew Morton wrote:

> >  asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
> > -			  struct timespec __user *utime, u32 __user *uaddr2)
> > +			  struct timespec __user *utime, u32 __user *uaddr2,
> > +			  int val3)
> 
> Is it safe to go adding a new argument to an existing syscall in this manner?
> 
> It'll work OK on x86 because of the stack layout but is the same true of
> all other supported architectures?

we added a new futex paramater once before (the original requeue patch) -
so if it broke anything, it didnt break loud enough for anyone to notice
:-|

	Ingo

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21  6:36     ` Andrew Morton
@ 2004-05-21  7:15       ` Ulrich Drepper
  2004-05-21  7:43       ` Jakub Jelinek
  1 sibling, 0 replies; 20+ messages in thread
From: Ulrich Drepper @ 2004-05-21  7:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: jakub, linux-kernel, mingo

Andrew Morton wrote:

> and we're expecting the a's and b's to line up across all architectures and
> compiler options.  I thought that on some architectures that only works out
> if the function has a vararg declaration.

I never heard that.


> Does it do the right thing on stack-grows-up machines?

Would be only HP/PA and I don't see this to be a problem.


> If the compiler passes the first few args via registers and the rest on the
> stack, are we sure that it won't at some level of complexity decide to pass
> _all_ the args on the stack?  It's free to do so, I think.

This is not how the calling conventions are designed.  If registers are
used they happens unconditional of the remainder of the parameter list.
 The stack is used as an overflow.


> I have a vague memory of getting bitten by this trick once...

I don't and, as Ingo mentioned, we already did it before.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21  6:36     ` Andrew Morton
  2004-05-21  7:15       ` Ulrich Drepper
@ 2004-05-21  7:43       ` Jakub Jelinek
  2004-05-22 16:58         ` Arnd Bergmann
  1 sibling, 1 reply; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-21  7:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ulrich Drepper, linux-kernel, mingo

On Thu, May 20, 2004 at 11:36:39PM -0700, Andrew Morton wrote:
> > > It'll work OK on x86 because of the stack layout but is the same true of
> > > all other supported architectures?
> > 
> > We add parameters at the end.  This does not influence how previous
> > values are passed.  And especially for syscalls it makes no difference.
> > 
> 
> what we're effectively doing is:
> 
> void foo(int a, int b, int c)
> {
> }
> 
> and from another compilation unit:
> 
> 	foo(a, b);
> 
> and we're expecting the a's and b's to line up across all architectures and
> compiler options.  I thought that on some architectures that only works out
> if the function has a vararg declaration.

The kernel syscall ABI is on many arches different from the compiler ABI.
Adding an argument this way certainly works on the architectures I'm
familiar with (i386, x86_64, ia64, ppc, ppc64, s390, s390x, sparc, sparc64,
alpha).  I believe arm will work too, don't keep track of the rest of
arches.

Well, for s390/s390x there is a problem that it doesn't allow (yet) 6
argument syscalls at all, so one possibility for s390* is adding a wrapper around
sys_futex which will take the 5th and 6th arguments for FUTEX_CMP_REQUEUE
from a structure pointed to by 5th argument and pass that to sys_futex.

If some weirdo arch has problems with this, it can certainly deal with it in
its architecture wrapper.

	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-20  9:38 [PATCH] Add FUTEX_CMP_REQUEUE futex op Jakub Jelinek
  2004-05-20 22:52 ` Andrew Morton
@ 2004-05-21 23:05 ` Andrew Morton
  2004-05-22 10:10   ` Ingo Oeser
  2004-05-23 17:33   ` Jakub Jelinek
  2004-05-29  3:13 ` Rusty Russell
  2 siblings, 2 replies; 20+ messages in thread
From: Andrew Morton @ 2004-05-21 23:05 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, mingo

Jakub Jelinek <jakub@redhat.com> wrote:
>
> The patch below extends FUTEX_REQUEUE operation with something FUTEX_WAIT
> already uses:
> FUTEX_CMP_REQUEUE is passed an additional argument which is the expected
> value of *futex.  Kernel then while holding the futex locks checks if

                                      ^^^^^^^^^^^^^^^^^^^^^^^

But in your patch the check is happening _prior_ to taking the futex locks.

> *futex != expected and returns -EAGAIN in that case, while if it is equal,
> continues with a normal FUTEX_REQUEUE operation.
> If the syscall returns -EAGAIN, NPTL can fall back to FUTEX_WAKE INT_MAX
> operation which doesn't have this problem, but is less efficient, while
> in the likely case that nobody hit the (small) window the efficient
> FUTEX_REQUEUE operation is used.

You've added an smp_mb().  These things are becoming like lock_kernel() -
hard for the reader to discern what it's protecting against.

Please always include a comment when adding a barrier like this.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21 23:05 ` Andrew Morton
@ 2004-05-22 10:10   ` Ingo Oeser
  2004-05-23 17:33   ` Jakub Jelinek
  1 sibling, 0 replies; 20+ messages in thread
From: Ingo Oeser @ 2004-05-22 10:10 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton, Jakub Jelinek, mingo

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On Saturday 22 May 2004 01:05, Andrew Morton wrote:
> You've added an smp_mb().  These things are becoming like lock_kernel() -
> hard for the reader to discern what it's protecting against.
> 
> Please always include a comment when adding a barrier like this.

What about including this in the API?

sth. like 

#define smp_mb(why) do { \
	static __debugdata__ char __lock_reason[] = why; \
	} while(0)


would be sufficient. __debugdata__ section
will then be stripped from compressed image but not vmlinux.


Is there another way to force developers to comment? ;-)


Regards

Ingo Oeser

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (GNU/Linux)

iD8DBQFArycyU56oYWuOrkARAljxAKDNE04H2/zr+rZcu1SAR3x19rUtHgCgwqKV
+Mpszd42hQ7hDQjphDobNRk=
=gh0g
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21  7:43       ` Jakub Jelinek
@ 2004-05-22 16:58         ` Arnd Bergmann
  2004-05-24  7:34           ` Jakub Jelinek
  0 siblings, 1 reply; 20+ messages in thread
From: Arnd Bergmann @ 2004-05-22 16:58 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andrew Morton, Ulrich Drepper, linux-kernel, mingo,
	Martin Schwidefsky

[-- Attachment #1: Type: text/plain, Size: 6446 bytes --]

On Friday 21 May 2004 09:43, Jakub Jelinek wrote:
> Well, for s390/s390x there is a problem that it doesn't allow (yet) 6
> argument syscalls at all, so one possibility for s390* is adding a wrapper around
> sys_futex which will take the 5th and 6th arguments for FUTEX_CMP_REQUEUE
> from a structure pointed to by 5th argument and pass that to sys_futex.

I would really prefer not re-inventing brain-damage like ipc_kludge. I
have tried to do a special s390_futex_cmp_requeue() syscall, which is
still somewhat ugly. At least it has the advantage that it does not break
the de-facto calling conventions for s390 syscalls that either pass
up to five arguments in r2 to r6 or everything in an array pointed to
by r2.

Martin and Jakub, does the patch below look ok to you or should we do
it in yet another way?

	Arnd <><

===== arch/s390/kernel/compat_linux.c 1.21 vs edited =====
--- 1.21/arch/s390/kernel/compat_linux.c	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/compat_linux.c	Sat May 22 17:34:21 2004
@@ -58,6 +58,7 @@
 #include <linux/compat.h>
 #include <linux/vfs.h>
 #include <linux/ptrace.h>
+#include <linux/futex.h>
 
 #include <asm/types.h>
 #include <asm/ipc.h>
@@ -1262,4 +1263,16 @@
 		ret = put_user (ktimer_id, timer_id);
 
 	return ret;
+}
+
+extern asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
+                struct compat_timespec *utime, u32 *uaddr2, int val3);
+
+asmlinkage long compat_s390_futex(u32 *uaddr, int op, int val,
+                struct compat_timespec *utime, u32 *uaddr2, int val3)
+{
+	if (op == FUTEX_CMP_REQUEUE)
+		return -EINVAL;
+
+	return compat_sys_futex(uaddr, op, val, utime, uaddr2, 0);
 }
===== arch/s390/kernel/compat_wrapper.S 1.13 vs edited =====
--- 1.13/arch/s390/kernel/compat_wrapper.S	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/compat_wrapper.S	Sat May 22 18:16:33 2004
@@ -1090,14 +1090,14 @@
 	llgtr	%r3,%r3			# struct stat64 *
 	jg	sys32_fstat64		# branch to system call
 
-	.globl  compat_sys_futex_wrapper 
-compat_sys_futex_wrapper:
+	.globl  compat_s390_futex_wrapper
+compat_s390_futex_wrapper:
 	llgtr	%r2,%r2			# u32 *
 	lgfr	%r3,%r3			# int
 	lgfr	%r4,%r4			# int
 	llgtr	%r5,%r5			# struct compat_timespec *
 	llgtr	%r6,%r6			# u32 *
-	jg	compat_sys_futex	# branch to system call
+	jg	compat_s390_futex	# branch to system call
 
 	.globl	sys32_setxattr_wrapper
 sys32_setxattr_wrapper:
@@ -1404,3 +1404,12 @@
 	llgtr	%r3,%r3			# struct compat_mq_attr *
 	llgtr	%r4,%r4			# struct compat_mq_attr *
 	jg	compat_sys_mq_getsetattr
+
+	.globl	compat_s390_futex_cmp_requeue_wrapper
+compat_s390_futex_cmp_requeue_wrapper:
+	llgtr	%r2,%r2			# u32 *
+	llgtr	%r3,%r3			# u32 *
+	lgfr	%r4,%r4			# int
+	lgfr	%r5,%r5			# int
+	lgfr	%r6,%r6			# int
+	jg	s390_futex_cmp_requeue
===== arch/s390/kernel/sys_s390.c 1.14 vs edited =====
--- 1.14/arch/s390/kernel/sys_s390.c	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/sys_s390.c	Sat May 22 17:31:57 2004
@@ -26,9 +26,8 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/utsname.h>
-#ifdef CONFIG_ARCH_S390X
 #include <linux/personality.h>
-#endif /* CONFIG_ARCH_S390X */
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/ipc.h>
@@ -265,3 +264,20 @@
 	return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice);
 }
 
+asmlinkage long
+s390_futex(u32 __user *uaddr, int op, int val,
+             struct timespec __user *utime, u32 __user *uaddr2)
+{
+	if (op == FUTEX_CMP_REQUEUE)
+		return -EINVAL;
+
+	return sys_futex(uaddr, op, val, utime, uaddr2, 0);
+}
+
+asmlinkage long
+s390_futex_cmp_requeue(u32 __user *uaddr, u32 __user *uaddr2,
+		int nr_wake, int nr_requeue, int val)
+{
+	return do_futex((unsigned long) uaddr, FUTEX_CMP_REQUEUE,
+			nr_wake, 0, (unsigned long)uaddr2, nr_requeue, val);
+}
===== arch/s390/kernel/syscalls.S 1.12 vs edited =====
--- 1.12/arch/s390/kernel/syscalls.S	Sat May 22 06:34:19 2004
+++ edited/arch/s390/kernel/syscalls.S	Sat May 22 18:15:24 2004
@@ -246,7 +246,7 @@
 SYSCALL(sys_fremovexattr,sys_fremovexattr,sys32_fremovexattr_wrapper)	/* 235 */
 SYSCALL(sys_gettid,sys_gettid,sys_gettid)
 SYSCALL(sys_tkill,sys_tkill,sys_tkill)
-SYSCALL(sys_futex,sys_futex,compat_sys_futex_wrapper)
+SYSCALL(s390_futex,s390_futex,compat_s390_futex_wrapper)
 SYSCALL(sys_sched_setaffinity,sys_sched_setaffinity,sys32_sched_setaffinity_wrapper)
 SYSCALL(sys_sched_getaffinity,sys_sched_getaffinity,sys32_sched_getaffinity_wrapper)	/* 240 */
 SYSCALL(sys_tgkill,sys_tgkill,sys_tgkill)
@@ -285,3 +285,4 @@
 SYSCALL(sys_mq_timedreceive,sys_mq_timedreceive,compat_sys_mq_timedreceive_wrapper)
 SYSCALL(sys_mq_notify,sys_mq_notify,compat_sys_mq_notify_wrapper)
 SYSCALL(sys_mq_getsetattr,sys_mq_getsetattr,compat_sys_mq_getsetattr_wrapper)
+SYSCALL(s390_futex_cmp_requeue,s390_futex_cmp_requeue,compat_s390_futex_cmp_requeue_wrapper)
===== include/asm-s390/unistd.h 1.26 vs edited =====
--- 1.26/include/asm-s390/unistd.h	Sat May 22 06:34:45 2004
+++ edited/include/asm-s390/unistd.h	Sat May 22 17:08:01 2004
@@ -269,8 +269,9 @@
 #define __NR_mq_timedreceive	274
 #define __NR_mq_notify		275
 #define __NR_mq_getsetattr	276
+#define __NR_futex_cmp_requeue	277
 
-#define NR_syscalls 277
+#define NR_syscalls 278
 
 /* 
  * There are some system calls that are not present on 64 bit, some
===== include/linux/syscalls.h 1.6 vs edited =====
--- 1.6/include/linux/syscalls.h	Sat May 22 06:31:47 2004
+++ edited/include/linux/syscalls.h	Sat May 22 17:32:58 2004
@@ -165,7 +165,8 @@
 asmlinkage long sys_waitpid(pid_t pid, unsigned int __user *stat_addr, int options);
 asmlinkage long sys_set_tid_address(int __user *tidptr);
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			struct timespec __user *utime, u32 __user *uaddr2);
+			struct timespec __user *utime, u32 __user *uaddr2,
+			int val3);
 
 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 				const char __user *uargs);
===== kernel/fork.c 1.169 vs edited =====
--- 1.169/kernel/fork.c	Sat May 22 06:32:19 2004
+++ edited/kernel/fork.c	Sat May 22 17:35:46 2004
@@ -516,7 +516,7 @@
 		 * not set up a proper pointer then tough luck.
 		 */
 		put_user(0, tidptr);
-		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
+		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 	}
 }
 

[-- Attachment #2: signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-21 23:05 ` Andrew Morton
  2004-05-22 10:10   ` Ingo Oeser
@ 2004-05-23 17:33   ` Jakub Jelinek
  1 sibling, 0 replies; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-23 17:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, mingo

On Fri, May 21, 2004 at 04:05:10PM -0700, Andrew Morton wrote:
> Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > The patch below extends FUTEX_REQUEUE operation with something FUTEX_WAIT
> > already uses:
> > FUTEX_CMP_REQUEUE is passed an additional argument which is the expected
> > value of *futex.  Kernel then while holding the futex locks checks if
> 
>                                       ^^^^^^^^^^^^^^^^^^^^^^^
> 
> But in your patch the check is happening _prior_ to taking the futex locks.

Sure, that's because I'm trying to avoid get_user while holding spinlock, as
get_user may sleep.

> You've added an smp_mb().  These things are becoming like lock_kernel() -
> hard for the reader to discern what it's protecting against.
> 
> Please always include a comment when adding a barrier like this.

Comment added below.

--- linux-2.6.5/include/linux/futex.h.jj	2004-04-04 05:37:36.000000000 +0200
+++ linux-2.6.5/include/linux/futex.h	2004-05-05 09:57:09.200306101 +0200
@@ -8,9 +8,10 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 #define FUTEX_REQUEUE (3)
-
+#define FUTEX_CMP_REQUEUE (4)
 
 long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2);
+		unsigned long timeout, unsigned long uaddr2, int val2,
+		int val3);
 
 #endif
--- linux-2.6.5/kernel/futex.c.jj	2004-04-04 05:36:52.000000000 +0200
+++ linux-2.6.5/kernel/futex.c	2004-05-05 12:23:33.481048623 +0200
@@ -96,6 +96,7 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
+       unsigned int	    nqueued;
        struct list_head       chain;
 };
 
@@ -318,13 +319,14 @@ out:
  * physical page.
  */
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-				int nr_wake, int nr_requeue)
+			 int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
+	unsigned int nqueued;
 
 	down_read(&current->mm->mmap_sem);
 
@@ -338,12 +340,41 @@ static int futex_requeue(unsigned long u
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
+	nqueued = bh1->nqueued;
+	if (likely (valp != NULL)) {
+		int curval;
+
+		/* In order to avoid doing get_user while
+		   holding bh1->lock and bh2->lock, nqueued
+		   (monotonically increasing field) must be first
+		   read, then *uaddr1 fetched from userland and
+		   after acquiring lock nqueued field compared with
+		   the stored value.  The smp_mb () below
+		   makes sure that bh1->nqueued is read from memory
+		   before *uaddr1.  */
+		smp_mb ();
+
+		if (get_user(curval, (int *)uaddr1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
 	if (bh1 < bh2)
 		spin_lock(&bh1->lock);
 	spin_lock(&bh2->lock);
 	if (bh1 > bh2)
 		spin_lock(&bh1->lock);
 
+	if (unlikely (nqueued != bh1->nqueued && valp != NULL)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -365,6 +396,7 @@ static int futex_requeue(unsigned long u
 		}
 	}
 
+out_unlock:
 	spin_unlock(&bh1->lock);
 	if (bh1 != bh2)
 		spin_unlock(&bh2->lock);
@@ -398,6 +430,7 @@ static void queue_me(struct futex_q *q, 
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
+	bh->nqueued++;
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
@@ -625,7 +658,7 @@ out:
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2)
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -641,7 +674,10 @@ long do_futex(unsigned long uaddr, int o
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2);
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -651,7 +687,8 @@ long do_futex(unsigned long uaddr, int o
 
 
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			  struct timespec __user *utime, u32 __user *uaddr2)
+			  struct timespec __user *utime, u32 __user *uaddr2,
+			  int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +702,11 @@ asmlinkage long sys_futex(u32 __user *ua
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 
 static struct super_block *
--- linux-2.6.5/kernel/compat.c.jj	2004-04-04 05:37:07.000000000 +0200
+++ linux-2.6.5/kernel/compat.c	2004-05-05 09:56:36.761119626 +0200
@@ -208,7 +208,7 @@ asmlinkage long compat_sys_sigprocmask(i
 
 #ifdef CONFIG_FUTEX
 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-		struct compat_timespec *utime, u32 *uaddr2)
+		struct compat_timespec *utime, u32 *uaddr2, int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -219,11 +219,11 @@ asmlinkage long compat_sys_futex(u32 *ua
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 #endif
 


	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-22 16:58         ` Arnd Bergmann
@ 2004-05-24  7:34           ` Jakub Jelinek
  2004-05-24  8:12             ` Andrew Morton
  2004-05-24 17:34             ` Arnd Bergmann
  0 siblings, 2 replies; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-24  7:34 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, Ulrich Drepper, linux-kernel, mingo,
	Martin Schwidefsky

On Sat, May 22, 2004 at 06:58:41PM +0200, Arnd Bergmann wrote:
> On Friday 21 May 2004 09:43, Jakub Jelinek wrote:
> > Well, for s390/s390x there is a problem that it doesn't allow (yet) 6
> > argument syscalls at all, so one possibility for s390* is adding a wrapper around
> > sys_futex which will take the 5th and 6th arguments for FUTEX_CMP_REQUEUE
> > from a structure pointed to by 5th argument and pass that to sys_futex.
> 
> I would really prefer not re-inventing brain-damage like ipc_kludge. I
> have tried to do a special s390_futex_cmp_requeue() syscall, which is
> still somewhat ugly. At least it has the advantage that it does not break
> the de-facto calling conventions for s390 syscalls that either pass
> up to five arguments in r2 to r6 or everything in an array pointed to
> by r2.
> 
> Martin and Jakub, does the patch below look ok to you or should we do
> it in yet another way?

My personal preference is:
1) change s390* entry*.S so that all syscalls can use up to 6 arguments,
   otherwise we'll have the same problem every now and then
   (last syscall before this one was fadvise64_64 if I remember well)
2) allow sys_futex to have 6 arguments (the 6th on the stack, get_user'ed
   from an s390 wrapper)
3) special structure passed in 5th argument for FUTEX_CMP_REQUEUE
4) your solution
   (you have a special wrapper around futex anyway, so why not to handle
   it there already and create completely new syscall).

That said, NPTL can deal with any of these variants and the decision
is up to Martin I think (assuming the base patch gets accepted, that is).
http://listman.redhat.com/archives/phil-list/2004-May/msg00031.html
is the latest version of the userland part of FUTEX_CMP_REQUEUE changes
(against CVS glibc) and there futex_requeue () macro is defined
in arch specific headers, so nptl/sysdeps/unix/sysv/linux/s390/lowlevellock.h
can do there its own black magic.

	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-24  7:34           ` Jakub Jelinek
@ 2004-05-24  8:12             ` Andrew Morton
  2004-05-24  8:19               ` Jakub Jelinek
  2004-05-24  8:27               ` bert hubert
  2004-05-24 17:34             ` Arnd Bergmann
  1 sibling, 2 replies; 20+ messages in thread
From: Andrew Morton @ 2004-05-24  8:12 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: arnd, drepper, linux-kernel, mingo, schwidefsky, bert hubert

Jakub Jelinek <jakub@redhat.com> wrote:
>
> That said, NPTL can deal with any of these variants and the decision
>  is up to Martin I think (assuming the base patch gets accepted, that is).

Well the race is real and does need a kernel fix, so I queued it up.  I
guess if the new argument to sys_futex() breaks some architecture they can
independently add a new syscall for it, or send a fix.  Mutter.

It's a bit of a shame that you need to be a rocket scientist to 
understand the futex syscall interface.  Bert, are you still maintaining
the manpage?  If so, is there enough info here to update it?




FUTEX_REQUEUE operation has been added to the kernel mainly to improve
pthread_cond_broadcast which previously used FUTEX_WAKE INT_MAX op. 
pthread_cond_broadcast releases internal condvar mutex before FUTEX_REQUEUE
operation, as otherwise the woken up thread most likely immediately sleeps
again on the internal condvar mutex until the broadcasting thread releases it.

Unfortunately this is racy and causes e.g. 
http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/nptl/tst-cond16.c?rev=1.1&content-type=text/x-cvsweb-markup&cvsroot=glibc
to hang on SMP.

http://listman.redhat.com/archives/phil-list/2004-May/msg00023.html contains
analysis how the hang happens, the problem is if any thread does
pthread_cond_*wait in between releasing of the internal condvar mutex and
FUTEX_REQUEUE operation, a wrong thread might be awaken (and immediately go to
sleep again because it doesn't satisfy conditions for returning from
pthread_cond_*wait) while the right thread requeued on the associated mutex
and there would be nobody to wake that thread up.

The patch below extends FUTEX_REQUEUE operation with something FUTEX_WAIT
already uses:

FUTEX_CMP_REQUEUE is passed an additional argument which is the expected value
of *futex.  Kernel then while holding the futex locks checks if *futex !=
expected and returns -EAGAIN in that case, while if it is equal, continues
with a normal FUTEX_REQUEUE operation.  If the syscall returns -EAGAIN, NPTL
can fall back to FUTEX_WAKE INT_MAX operation which doesn't have this problem,
but is less efficient, while in the likely case that nobody hit the (small)
window the efficient FUTEX_REQUEUE operation is used.


---

 25-akpm/include/linux/futex.h |    5 ++--
 25-akpm/kernel/compat.c       |    6 ++---
 25-akpm/kernel/futex.c        |   49 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 49 insertions(+), 11 deletions(-)

diff -puN include/linux/futex.h~add-futex_cmp_requeue-futex-op include/linux/futex.h
--- 25/include/linux/futex.h~add-futex_cmp_requeue-futex-op	2004-05-23 11:15:27.460533968 -0700
+++ 25-akpm/include/linux/futex.h	2004-05-23 11:15:27.467532904 -0700
@@ -8,9 +8,10 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 #define FUTEX_REQUEUE (3)
-
+#define FUTEX_CMP_REQUEUE (4)
 
 long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2);
+		unsigned long timeout, unsigned long uaddr2, int val2,
+		int val3);
 
 #endif
diff -puN kernel/compat.c~add-futex_cmp_requeue-futex-op kernel/compat.c
--- 25/kernel/compat.c~add-futex_cmp_requeue-futex-op	2004-05-23 11:15:27.462533664 -0700
+++ 25-akpm/kernel/compat.c	2004-05-23 11:15:27.470532448 -0700
@@ -208,7 +208,7 @@ asmlinkage long compat_sys_sigprocmask(i
 
 #ifdef CONFIG_FUTEX
 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-		struct compat_timespec *utime, u32 *uaddr2)
+		struct compat_timespec *utime, u32 *uaddr2, int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -219,11 +219,11 @@ asmlinkage long compat_sys_futex(u32 *ua
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 #endif
 
diff -puN kernel/futex.c~add-futex_cmp_requeue-futex-op kernel/futex.c
--- 25/kernel/futex.c~add-futex_cmp_requeue-futex-op	2004-05-23 11:15:27.463533512 -0700
+++ 25-akpm/kernel/futex.c	2004-05-23 11:15:27.469532600 -0700
@@ -96,6 +96,7 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
+       unsigned int	    nqueued;
        struct list_head       chain;
 };
 
@@ -318,13 +319,14 @@ out:
  * physical page.
  */
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-				int nr_wake, int nr_requeue)
+			 int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
+	unsigned int nqueued;
 
 	down_read(&current->mm->mmap_sem);
 
@@ -338,12 +340,41 @@ static int futex_requeue(unsigned long u
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
+	nqueued = bh1->nqueued;
+	if (likely(valp != NULL)) {
+		int curval;
+
+		/* In order to avoid doing get_user while
+		   holding bh1->lock and bh2->lock, nqueued
+		   (monotonically increasing field) must be first
+		   read, then *uaddr1 fetched from userland and
+		   after acquiring lock nqueued field compared with
+		   the stored value.  The smp_mb () below
+		   makes sure that bh1->nqueued is read from memory
+		   before *uaddr1.  */
+		smp_mb();
+
+		if (get_user(curval, (int *)uaddr1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
 	if (bh1 < bh2)
 		spin_lock(&bh1->lock);
 	spin_lock(&bh2->lock);
 	if (bh1 > bh2)
 		spin_lock(&bh1->lock);
 
+	if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -365,6 +396,7 @@ static int futex_requeue(unsigned long u
 		}
 	}
 
+out_unlock:
 	spin_unlock(&bh1->lock);
 	if (bh1 != bh2)
 		spin_unlock(&bh2->lock);
@@ -398,6 +430,7 @@ static void queue_me(struct futex_q *q, 
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
+	bh->nqueued++;
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
@@ -625,7 +658,7 @@ out:
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2)
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -641,7 +674,10 @@ long do_futex(unsigned long uaddr, int o
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2);
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -651,7 +687,8 @@ long do_futex(unsigned long uaddr, int o
 
 
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			  struct timespec __user *utime, u32 __user *uaddr2)
+			  struct timespec __user *utime, u32 __user *uaddr2,
+			  int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +702,11 @@ asmlinkage long sys_futex(u32 __user *ua
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 
 static struct super_block *

_


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-24  8:12             ` Andrew Morton
@ 2004-05-24  8:19               ` Jakub Jelinek
  2004-05-28 13:09                 ` DOCUMENTATION " bert hubert
  2004-05-24  8:27               ` bert hubert
  1 sibling, 1 reply; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-24  8:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: arnd, drepper, linux-kernel, mingo, schwidefsky, bert hubert

On Mon, May 24, 2004 at 01:12:03AM -0700, Andrew Morton wrote:
> Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > That said, NPTL can deal with any of these variants and the decision
> >  is up to Martin I think (assuming the base patch gets accepted, that is).
> 
> Well the race is real and does need a kernel fix, so I queued it up.  I
> guess if the new argument to sys_futex() breaks some architecture they can
> independently add a new syscall for it, or send a fix.  Mutter.
> 
> It's a bit of a shame that you need to be a rocket scientist to 
> understand the futex syscall interface.  Bert, are you still maintaining
> the manpage?  If so, is there enough info here to update it?

The latest futex(2) or futex(4) manpage I saw doesn't mention FUTEX_REQUEUE
at all.
Also, any futex man page should probably SEE ALSO Ulrich's futex paper:
http://people.redhat.com/drepper/futex.pdf
which helps understanding how to successfully use futexes, because
it is certainly not trivial.

	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-24  8:12             ` Andrew Morton
  2004-05-24  8:19               ` Jakub Jelinek
@ 2004-05-24  8:27               ` bert hubert
  1 sibling, 0 replies; 20+ messages in thread
From: bert hubert @ 2004-05-24  8:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jakub Jelinek, arnd, drepper, linux-kernel, mingo, schwidefsky

On Mon, May 24, 2004 at 01:12:03AM -0700, Andrew Morton wrote:
> Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > That said, NPTL can deal with any of these variants and the decision
> >  is up to Martin I think (assuming the base patch gets accepted, that is).
> 
> Well the race is real and does need a kernel fix, so I queued it up.  I
> guess if the new argument to sys_futex() breaks some architecture they can
> independently add a new syscall for it, or send a fix.  Mutter.
> 
> It's a bit of a shame that you need to be a rocket scientist to 
> understand the futex syscall interface.  Bert, are you still maintaining
> the manpage?  If so, is there enough info here to update it?

I'll spend some time with the manpage tomorrow. Some other comments have
been queued too and I'll process them. By that time, I'll hand over the
manpages to aeb.

Thanks for the heads up.

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-24  7:34           ` Jakub Jelinek
  2004-05-24  8:12             ` Andrew Morton
@ 2004-05-24 17:34             ` Arnd Bergmann
  1 sibling, 0 replies; 20+ messages in thread
From: Arnd Bergmann @ 2004-05-24 17:34 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andrew Morton, Ulrich Drepper, linux-kernel, mingo,
	Martin Schwidefsky

[-- Attachment #1: Type: text/plain, Size: 5806 bytes --]

On Monday 24 May 2004 09:34, Jakub Jelinek wrote:
> My personal preference is:
> 1) change s390* entry*.S so that all syscalls can use up to 6 arguments,
>    otherwise we'll have the same problem every now and then
>    (last syscall before this one was fadvise64_64 if I remember well)
> 2) allow sys_futex to have 6 arguments (the 6th on the stack, get_user'ed
>    from an s390 wrapper)
> 3) special structure passed in 5th argument for FUTEX_CMP_REQUEUE
> 4) your solution
>    (you have a special wrapper around futex anyway, so why not to handle
>    it there already and create completely new syscall).

I have talked to Martin about it (he won't be online for one more week),
and he proposed a variant of 2): sys_futex, and possibly further new
syscalls that might get added, pass their sixth argument in %r7,
which is easier to get by than the user stack. Unlike doing it in
the standard system call path, this won't cause any overhead for
the other syscalls, but we need some wrappers.

I haven't tested this, because I'm not too familiar with glibc
changes. Jakub, if you are happy with this approach, can you
add the respective glibc stuff and try if this is any good?

	Arnd <><

[PATCH] Fix FUTEX_CMP_REQUEUE on s390

	From: Arnd Bergmann <arnd@arndb.de>

System calls normally can not have more than five arguments on s390
because of ABI restrictions, but the addition of the FUTEX_CMP_REQUEUE
added a sixth argument to sys_futex. This patch extends the regular
syscall conventions by an extra argument register for sys_futex.

The regular method of passing all arguments on the user stack for six
argument syscalls on s390 does not work here because it would break
the existing usage of futex.

I'm also fixing the sys_futex prototype in <linux/syscalls.h> to make
it possible to call it from s390_futex.

===== arch/s390/kernel/compat_linux.c 1.21 vs edited =====
--- 1.21/arch/s390/kernel/compat_linux.c	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/compat_linux.c	Mon May 24 18:51:26 2004
@@ -1263,3 +1263,16 @@
 
 	return ret;
 }
+
+asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
+                struct compat_timespec *utime, u32 *uaddr2, int val3);
+
+asmlinkage long
+sys32_futex(u32 __user *uaddr, int op, int val,
+	struct compat_timespec __user *utime, u32 __user *uaddr2)
+{
+	struct pt_regs *regs;
+
+	regs = __KSTK_PTREGS(current);
+	return compat_sys_futex(uaddr, op, val, utime, uaddr2, regs->gprs[7]);
+}
===== arch/s390/kernel/compat_wrapper.S 1.13 vs edited =====
--- 1.13/arch/s390/kernel/compat_wrapper.S	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/compat_wrapper.S	Mon May 24 19:04:15 2004
@@ -1090,14 +1090,14 @@
 	llgtr	%r3,%r3			# struct stat64 *
 	jg	sys32_fstat64		# branch to system call
 
-	.globl  compat_sys_futex_wrapper 
-compat_sys_futex_wrapper:
+	.globl  sys32_futex_wrapper 
+sys32_futex_wrapper:
 	llgtr	%r2,%r2			# u32 *
 	lgfr	%r3,%r3			# int
 	lgfr	%r4,%r4			# int
 	llgtr	%r5,%r5			# struct compat_timespec *
 	llgtr	%r6,%r6			# u32 *
-	jg	compat_sys_futex	# branch to system call
+	jg	sys32_futex		# branch to system call
 
 	.globl	sys32_setxattr_wrapper
 sys32_setxattr_wrapper:
===== arch/s390/kernel/sys_s390.c 1.14 vs edited =====
--- 1.14/arch/s390/kernel/sys_s390.c	Sat May 22 06:31:44 2004
+++ edited/arch/s390/kernel/sys_s390.c	Mon May 24 18:51:26 2004
@@ -265,3 +265,12 @@
 	return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice);
 }
 
+asmlinkage long
+s390_futex(u32 __user *uaddr, int op, int val,
+	struct timespec __user *utime, u32 __user *uaddr2)
+{
+	struct pt_regs *regs;
+
+	regs = __KSTK_PTREGS(current);
+	return sys_futex(uaddr, op, val, utime, uaddr2, regs->gprs[7]);
+}
===== arch/s390/kernel/syscalls.S 1.12 vs edited =====
--- 1.12/arch/s390/kernel/syscalls.S	Sat May 22 06:34:19 2004
+++ edited/arch/s390/kernel/syscalls.S	Mon May 24 19:03:28 2004
@@ -246,7 +246,7 @@
 SYSCALL(sys_fremovexattr,sys_fremovexattr,sys32_fremovexattr_wrapper)	/* 235 */
 SYSCALL(sys_gettid,sys_gettid,sys_gettid)
 SYSCALL(sys_tkill,sys_tkill,sys_tkill)
-SYSCALL(sys_futex,sys_futex,compat_sys_futex_wrapper)
+SYSCALL(s390_futex,s390_futex,sys32_futex_wrapper)
 SYSCALL(sys_sched_setaffinity,sys_sched_setaffinity,sys32_sched_setaffinity_wrapper)
 SYSCALL(sys_sched_getaffinity,sys_sched_getaffinity,sys32_sched_getaffinity_wrapper)	/* 240 */
 SYSCALL(sys_tgkill,sys_tgkill,sys_tgkill)
===== include/linux/syscalls.h 1.6 vs edited =====
--- 1.6/include/linux/syscalls.h	Sat May 22 06:31:47 2004
+++ edited/include/linux/syscalls.h	Mon May 24 18:51:26 2004
@@ -165,7 +165,7 @@
 asmlinkage long sys_waitpid(pid_t pid, unsigned int __user *stat_addr, int options);
 asmlinkage long sys_set_tid_address(int __user *tidptr);
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			struct timespec __user *utime, u32 __user *uaddr2);
+			struct timespec __user *utime, u32 __user *uaddr2, int val3);
 
 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 				const char __user *uargs);
===== kernel/fork.c 1.169 vs edited =====
--- 1.169/kernel/fork.c	Sat May 22 06:32:19 2004
+++ edited/kernel/fork.c	Mon May 24 18:51:26 2004
@@ -516,7 +516,7 @@
 		 * not set up a proper pointer then tough luck.
 		 */
 		put_user(0, tidptr);
-		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
+		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 	}
 }
 
===== kernel/futex.c 1.40 vs edited =====
--- 1.40/kernel/futex.c	Sat May 22 09:03:35 2004
+++ edited/kernel/futex.c	Mon May 24 18:51:26 2004
@@ -38,6 +38,7 @@
 #include <linux/futex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
+#include <linux/syscalls.h>
 
 #define FUTEX_HASHBITS 8
 

[-- Attachment #2: signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* DOCUMENTATION Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-24  8:19               ` Jakub Jelinek
@ 2004-05-28 13:09                 ` bert hubert
  2004-05-28 14:02                   ` Jakub Jelinek
  0 siblings, 1 reply; 20+ messages in thread
From: bert hubert @ 2004-05-28 13:09 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andrew Morton, arnd, drepper, linux-kernel, mingo, schwidefsky

> > It's a bit of a shame that you need to be a rocket scientist to 
> > understand the futex syscall interface.  Bert, are you still maintaining
> > the manpage?  If so, is there enough info here to update it?
> 
> The latest futex(2) or futex(4) manpage I saw doesn't mention FUTEX_REQUEUE
> at all.

Now fixed, please see http://ds9a.nl/futex-manpages - but please realise I'm
somewhat out of my depth. Comments welcome.

Futexes have mutated into complicated things, I wonder if this was the last
of the changes needed.

The big change in the manpages is the addition of FUTEX_REQUEUE and
FUTEX_CMP_REQUEUE. Furthermore, I realised that the futex system call does
not return EAGAIN etc, it returns -EAGAIN. I guesstimated that CMP_REQUEUE
will be merged before 2.6.7.

To clarify the overloaded situation wrt the futex syscall, I split it up in
three prototypes. 

Ulrich, does/will glibc provide a futex(2) function? Or should people just
call the syscall themselves? 

There were also some complaints I did not address futexfs but as far as I
can see, it is a kernel internal matter of no interest to userspace coders?

> Also, any futex man page should probably SEE ALSO Ulrich's futex paper:
> http://people.redhat.com/drepper/futex.pdf

Referenced, thanks!

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: DOCUMENTATION Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-28 13:09                 ` DOCUMENTATION " bert hubert
@ 2004-05-28 14:02                   ` Jakub Jelinek
  2004-05-28 15:39                     ` bert hubert
  0 siblings, 1 reply; 20+ messages in thread
From: Jakub Jelinek @ 2004-05-28 14:02 UTC (permalink / raw)
  To: bert hubert, Andrew Morton, arnd, drepper, linux-kernel, mingo,
	schwidefsky

On Fri, May 28, 2004 at 03:09:35PM +0200, bert hubert wrote:
> > > It's a bit of a shame that you need to be a rocket scientist to 
> > > understand the futex syscall interface.  Bert, are you still maintaining
> > > the manpage?  If so, is there enough info here to update it?
> > 
> > The latest futex(2) or futex(4) manpage I saw doesn't mention FUTEX_REQUEUE
> > at all.
> 
> Now fixed, please see http://ds9a.nl/futex-manpages - but please realise I'm
> somewhat out of my depth. Comments welcome.
> 
> Futexes have mutated into complicated things, I wonder if this was the last
> of the changes needed.
> 
> The big change in the manpages is the addition of FUTEX_REQUEUE and
> FUTEX_CMP_REQUEUE. Furthermore, I realised that the futex system call does
> not return EAGAIN etc, it returns -EAGAIN. I guesstimated that CMP_REQUEUE

futex behaves like most of the other syscalls, i.e. on error returns -1
and sets errno to the error value, like EAGAIN, EFAULT etc.
This is done in a userland wrapper around the syscall, how the kernel tells
userland an error happened and what errno value should be set is an
architecture specific implementation detail.
The man pages in the 2nd section document the behaviour of the userland
wrappers (see e.g. read(2)), not what the kernel actually returns.

> Ulrich, does/will glibc provide a futex(2) function? Or should people just
> call the syscall themselves? 

glibc doesn't provide a futex(2) function, so at least ATM people can use
#include <sys/syscall.h>
syscall (SYS_futex, &futex, FUTEX_xyz, ...) or come up with their own
assembly to invoke the syscall.

	Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: DOCUMENTATION Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-28 14:02                   ` Jakub Jelinek
@ 2004-05-28 15:39                     ` bert hubert
  0 siblings, 0 replies; 20+ messages in thread
From: bert hubert @ 2004-05-28 15:39 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: arnd, drepper, jamie, linux-kernel, mingo, schwidefsky

On Fri, May 28, 2004 at 10:02:34AM -0400, Jakub Jelinek wrote:

> The man pages in the 2nd section document the behaviour of the userland
> wrappers (see e.g. read(2)), not what the kernel actually returns.

Ok - but in the absence of futex(2), is it perhaps better to move this page
to futex(9)? Or merge it into futex(4)?

> > Ulrich, does/will glibc provide a futex(2) function? Or should people just
> > call the syscall themselves? 
> 
> glibc doesn't provide a futex(2) function, so at least ATM people can use
> #include <sys/syscall.h>

If it will provide one, I'll change the page to be in line with how futex(2)
will probably work, with errno etc.

Regarding the verbiage, I need input about the use of the word 'process', as
I'm not sure if there is a generic word covering both processes and threads.
Jamie suggests 'tasks', but that is kernel lingo and of little use for
userspace developers. Suggestions?

Thanks.

bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
  2004-05-20  9:38 [PATCH] Add FUTEX_CMP_REQUEUE futex op Jakub Jelinek
  2004-05-20 22:52 ` Andrew Morton
  2004-05-21 23:05 ` Andrew Morton
@ 2004-05-29  3:13 ` Rusty Russell
  2 siblings, 0 replies; 20+ messages in thread
From: Rusty Russell @ 2004-05-29  3:13 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: lkml - Kernel Mailing List, mingo, Andrew Morton

On Thu, 2004-05-20 at 19:38, Jakub Jelinek wrote:
> Hi!
> 
> FUTEX_REQUEUE operation has been added to the kernel mainly to improve
> pthread_cond_broadcast which previously used FUTEX_WAKE INT_MAX op.

For the record, I wasn't happy about adding FUTEX_REQUEUE to optimize
for suboptimal apps using the horrid pthreads interface, but I
understand the benchmarking realities.

I'm certainly way less than thrilled to discover that it doesn't work,
and we need to implement FUTEX_CMP_REQUEUE, and of course can't get rid
of FUTEX_REQUEUE.

The base futex concept and interface is simple (although the
implementation w/ the Linux mm subsystem proved interesting in the
corner cases); it's increasingly becoming a horror with these kind of
hacks.

8(
Rusty.
-- 
Anyone who quotes me in their signature is an idiot -- Rusty Russell


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2004-05-29  3:13 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-05-20  9:38 [PATCH] Add FUTEX_CMP_REQUEUE futex op Jakub Jelinek
2004-05-20 22:52 ` Andrew Morton
2004-05-21  6:06   ` Ulrich Drepper
2004-05-21  6:36     ` Andrew Morton
2004-05-21  7:15       ` Ulrich Drepper
2004-05-21  7:43       ` Jakub Jelinek
2004-05-22 16:58         ` Arnd Bergmann
2004-05-24  7:34           ` Jakub Jelinek
2004-05-24  8:12             ` Andrew Morton
2004-05-24  8:19               ` Jakub Jelinek
2004-05-28 13:09                 ` DOCUMENTATION " bert hubert
2004-05-28 14:02                   ` Jakub Jelinek
2004-05-28 15:39                     ` bert hubert
2004-05-24  8:27               ` bert hubert
2004-05-24 17:34             ` Arnd Bergmann
2004-05-21  7:05   ` Ingo Molnar
2004-05-21 23:05 ` Andrew Morton
2004-05-22 10:10   ` Ingo Oeser
2004-05-23 17:33   ` Jakub Jelinek
2004-05-29  3:13 ` Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox