public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Add FUTEX_CMP_REQUEUE futex op
@ 2004-05-20  9:38 Jakub Jelinek
  2004-05-20 22:52 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Jakub Jelinek @ 2004-05-20  9:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo

Hi!

FUTEX_REQUEUE operation has been added to the kernel mainly to improve
pthread_cond_broadcast which previously used FUTEX_WAKE INT_MAX op.
pthread_cond_broadcast releases internal condvar mutex before FUTEX_REQUEUE
operation, as otherwise the woken up thread most likely immediately sleeps
again on the internal condvar mutex until the broadcasting thread releases
it.
Unfortunately this is racy and causes e.g.
http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/nptl/tst-cond16.c?rev=1.1&content-type=text/x-cvsweb-markup&cvsroot=glibc
to hang on SMP.
http://listman.redhat.com/archives/phil-list/2004-May/msg00023.html
contains analysis how the hang happens, the problem is if any thread
does pthread_cond_*wait in between releasing of the internal condvar
mutex and FUTEX_REQUEUE operation, a wrong thread might be awaken (and
immediately go to sleep again because it doesn't satisfy conditions for
returning from pthread_cond_*wait) while the right thread requeued on
the associated mutex and there would be nobody to wake that thread up.

The patch below extends FUTEX_REQUEUE operation with something FUTEX_WAIT
already uses:
FUTEX_CMP_REQUEUE is passed an additional argument which is the expected
value of *futex.  Kernel then while holding the futex locks checks if
*futex != expected and returns -EAGAIN in that case, while if it is equal,
continues with a normal FUTEX_REQUEUE operation.
If the syscall returns -EAGAIN, NPTL can fall back to FUTEX_WAKE INT_MAX
operation which doesn't have this problem, but is less efficient, while
in the likely case that nobody hit the (small) window the efficient
FUTEX_REQUEUE operation is used.

--- linux-2.6.5/include/linux/futex.h.jj	2004-04-04 05:37:36.000000000 +0200
+++ linux-2.6.5/include/linux/futex.h	2004-05-05 09:57:09.200306101 +0200
@@ -8,9 +8,10 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 #define FUTEX_REQUEUE (3)
-
+#define FUTEX_CMP_REQUEUE (4)
 
 long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2);
+		unsigned long timeout, unsigned long uaddr2, int val2,
+		int val3);
 
 #endif
--- linux-2.6.5/kernel/futex.c.jj	2004-04-04 05:36:52.000000000 +0200
+++ linux-2.6.5/kernel/futex.c	2004-05-05 12:23:33.481048623 +0200
@@ -96,6 +96,7 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
+       unsigned int	    nqueued;
        struct list_head       chain;
 };
 
@@ -318,13 +319,14 @@ out:
  * physical page.
  */
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-				int nr_wake, int nr_requeue)
+			 int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
+	unsigned int nqueued;
 
 	down_read(&current->mm->mmap_sem);
 
@@ -338,12 +340,33 @@ static int futex_requeue(unsigned long u
 	bh1 = hash_futex(&key1);
 	bh2 = hash_futex(&key2);
 
+	nqueued = bh1->nqueued;
+	if (likely (valp != NULL)) {
+		int curval;
+
+		smp_mb ();
+
+		if (get_user(curval, (int *)uaddr1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
 	if (bh1 < bh2)
 		spin_lock(&bh1->lock);
 	spin_lock(&bh2->lock);
 	if (bh1 > bh2)
 		spin_lock(&bh1->lock);
 
+	if (unlikely (nqueued != bh1->nqueued && valp != NULL)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
 	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
@@ -365,6 +388,7 @@ static int futex_requeue(unsigned long u
 		}
 	}
 
+out_unlock:
 	spin_unlock(&bh1->lock);
 	if (bh1 != bh2)
 		spin_unlock(&bh2->lock);
@@ -398,6 +422,7 @@ static void queue_me(struct futex_q *q, 
 	q->lock_ptr = &bh->lock;
 
 	spin_lock(&bh->lock);
+	bh->nqueued++;
 	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
@@ -625,7 +650,7 @@ out:
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2)
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -641,7 +666,10 @@ long do_futex(unsigned long uaddr, int o
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2);
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -651,7 +679,8 @@ long do_futex(unsigned long uaddr, int o
 
 
 asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
-			  struct timespec __user *utime, u32 __user *uaddr2)
+			  struct timespec __user *utime, u32 __user *uaddr2,
+			  int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +694,11 @@ asmlinkage long sys_futex(u32 __user *ua
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 
 static struct super_block *
--- linux-2.6.5/kernel/compat.c.jj	2004-04-04 05:37:07.000000000 +0200
+++ linux-2.6.5/kernel/compat.c	2004-05-05 09:56:36.761119626 +0200
@@ -208,7 +208,7 @@ asmlinkage long compat_sys_sigprocmask(i
 
 #ifdef CONFIG_FUTEX
 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-		struct compat_timespec *utime, u32 *uaddr2)
+		struct compat_timespec *utime, u32 *uaddr2, int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -219,11 +219,11 @@ asmlinkage long compat_sys_futex(u32 *ua
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
-	if (op == FUTEX_REQUEUE)
+	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2);
+			(unsigned long)uaddr2, val2, val3);
 }
 #endif
 

	Jakub

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: [PATCH] Add FUTEX_CMP_REQUEUE futex op
@ 2004-06-07 16:03 Martin Schwidefsky
  0 siblings, 0 replies; 24+ messages in thread
From: Martin Schwidefsky @ 2004-06-07 16:03 UTC (permalink / raw)
  To: arnd; +Cc: jakub, akpm, drepper, linux-kernel, mingo

> > My personal preference is:
> > 1) change s390* entry*.S so that all syscalls can use up to 6 arguments,
> >    otherwise we'll have the same problem every now and then
> >    (last syscall before this one was fadvise64_64 if I remember well)
> > 2) allow sys_futex to have 6 arguments (the 6th on the stack, get_user'ed
> >    from an s390 wrapper)
> > 3) special structure passed in 5th argument for FUTEX_CMP_REQUEUE
> > 4) your solution
> >    (you have a special wrapper around futex anyway, so why not to handle
> >    it there already and create completely new syscall).
> 
> I have talked to Martin about it (he won't be online for one more week),
> and he proposed a variant of 2): sys_futex, and possibly further new
> syscalls that might get added, pass their sixth argument in %r7,
> which is easier to get by than the user stack. Unlike doing it in
> the standard system call path, this won't cause any overhead for
> the other syscalls, but we need some wrappers.

I prefer to store the additional argument in %r7 for ALL system calls
to avoid addtitional system call wrappers. That way new system calls
with 6 arguments can be added without problems. The store for the
additional 6th argument can be hidden in a agi slot so this is pretty
much the perfect solution.

blue skies,
  Martin.

---

[PATCH] s390: add support for 6 system call arguments (FUTEX_CMP_REQUEUE).

This patch adds support for 6 system call arguments on s390. The
first exploiter of this will be the sys_futex system call for the
FUTEX_CMP_REQUEUE operation. The idea is simple: use register %r7
for the 6th argument. This can be extended to 7/8/9/... arguments
if there ever will be the need for it. To call the system call
function in the kernel the additional arguments needs to get stored
on the stack. 8 bytes are added to the head of struct pt_regs. %r7
is stored to the additional field for all system calls. The store
is hidden in a address-generation-interlock slot, it doesn't slow
down the system call path.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

diffstat:
 arch/s390/kernel/asm-offsets.c    |    1 +
 arch/s390/kernel/compat_wrapper.S |    2 ++
 arch/s390/kernel/entry.S          |   10 +++++++---
 arch/s390/kernel/entry64.S        |   10 +++++++---
 arch/s390/kernel/ptrace.c         |   10 +++++-----
 include/asm-s390/ptrace.h         |    1 +
 6 files changed, 23 insertions(+), 11 deletions(-)

diff -urN linux-2.6/arch/s390/kernel/asm-offsets.c linux-2.6-s390/arch/s390/kernel/asm-offsets.c
--- linux-2.6/arch/s390/kernel/asm-offsets.c	Mon May 10 04:32:26 2004
+++ linux-2.6-s390/arch/s390/kernel/asm-offsets.c	Mon Jun  7 16:10:03 2004
@@ -32,6 +32,7 @@
 	DEFINE(__TI_cpu, offsetof(struct thread_info, cpu),);
 	DEFINE(__TI_precount, offsetof(struct thread_info, preempt_count),);
 	BLANK();
+	DEFINE(__PT_ARGS, offsetof(struct pt_regs, args),);
 	DEFINE(__PT_PSW, offsetof(struct pt_regs, psw),);
 	DEFINE(__PT_GPRS, offsetof(struct pt_regs, gprs),);
 	DEFINE(__PT_ORIG_GPR2, offsetof(struct pt_regs, orig_gpr2),);
diff -urN linux-2.6/arch/s390/kernel/compat_wrapper.S linux-2.6-s390/arch/s390/kernel/compat_wrapper.S
--- linux-2.6/arch/s390/kernel/compat_wrapper.S	Mon Jun  7 16:07:24 2004
+++ linux-2.6-s390/arch/s390/kernel/compat_wrapper.S	Mon Jun  7 16:07:53 2004
@@ -1097,6 +1097,8 @@
 	lgfr	%r4,%r4			# int
 	llgtr	%r5,%r5			# struct compat_timespec *
 	llgtr	%r6,%r6			# u32 *
+	lgf	%r0,164(%r15)		# int
+	stg	%r0,160(%r15)
 	jg	compat_sys_futex	# branch to system call
 
 	.globl	sys32_setxattr_wrapper
diff -urN linux-2.6/arch/s390/kernel/entry.S linux-2.6-s390/arch/s390/kernel/entry.S
--- linux-2.6/arch/s390/kernel/entry.S	Mon Jun  7 16:07:24 2004
+++ linux-2.6-s390/arch/s390/kernel/entry.S	Mon Jun  7 16:11:55 2004
@@ -24,7 +24,8 @@
  * Stack layout for the system_call stack entry.
  * The first few entries are identical to the user_regs_struct.
  */
-SP_PTREGS    =  STACK_FRAME_OVERHEAD 
+SP_PTREGS    =  STACK_FRAME_OVERHEAD
+SP_ARGS      =  STACK_FRAME_OVERHEAD + __PT_ARGS
 SP_PSW       =  STACK_FRAME_OVERHEAD + __PT_PSW
 SP_R0        =  STACK_FRAME_OVERHEAD + __PT_GPRS
 SP_R1        =  STACK_FRAME_OVERHEAD + __PT_GPRS + 4
@@ -230,12 +231,14 @@
 sysc_enter:
         GET_THREAD_INFO           # load pointer to task_struct to R9
 	sla	%r7,2             # *4 and test for svc 0
-	bnz	BASED(sysc_do_restart)  # svc number > 0
+	bnz	BASED(sysc_nr_ok) # svc number > 0
 	# svc 0: system call number in %r1
 	cl	%r1,BASED(.Lnr_syscalls)
-	bnl	BASED(sysc_do_restart)
+	bnl	BASED(sysc_nr_ok)
 	lr	%r7,%r1           # copy svc number to %r7
 	sla	%r7,2             # *4
+sysc_nr_ok:
+	mvc	SP_ARGS(4,%r15),SP_R7(%r15)
 sysc_do_restart:
 	tm	__TI_flags+3(%r9),(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
         l       %r8,sys_call_table-system_call(%r7,%r13) # get system call addr.
@@ -510,6 +513,7 @@
 	lr	%r7,%r1           # copy svc number to %r7
 	sla	%r7,2             # *4
 pgm_svcstd:
+	mvc	SP_ARGS(4,%r15),SP_R7(%r15)
 	tm	__TI_flags+3(%r9),(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
         l       %r8,sys_call_table-system_call(%r7,%r13) # get system call addr.
         bnz     BASED(pgm_tracesys)
diff -urN linux-2.6/arch/s390/kernel/entry64.S linux-2.6-s390/arch/s390/kernel/entry64.S
--- linux-2.6/arch/s390/kernel/entry64.S	Mon Jun  7 16:07:24 2004
+++ linux-2.6-s390/arch/s390/kernel/entry64.S	Mon Jun  7 16:13:17 2004
@@ -24,7 +24,8 @@
  * Stack layout for the system_call stack entry.
  * The first few entries are identical to the user_regs_struct.
  */
-SP_PTREGS    =  STACK_FRAME_OVERHEAD 
+SP_PTREGS    =  STACK_FRAME_OVERHEAD
+SP_ARGS      =  STACK_FRAME_OVERHEAD + __PT_ARGS
 SP_PSW       =  STACK_FRAME_OVERHEAD + __PT_PSW
 SP_R0        =  STACK_FRAME_OVERHEAD + __PT_GPRS
 SP_R1        =  STACK_FRAME_OVERHEAD + __PT_GPRS + 8
@@ -214,13 +215,15 @@
 sysc_enter:
         GET_THREAD_INFO           # load pointer to task_struct to R9
         slag    %r7,%r7,2         # *4 and test for svc 0
-	jnz	sysc_do_restart
+	jnz	sysc_nr_ok
 	# svc 0: system call number in %r1
 	lghi	%r0,NR_syscalls
 	clr	%r1,%r0
-	jnl	sysc_do_restart
+	jnl	sysc_nr_ok
 	lgfr	%r7,%r1           # clear high word in r1
 	slag    %r7,%r7,2         # svc 0: system call number in %r1
+sysc_nr_ok:
+	mvc	SP_ARGS(8,%r15),SP_R7(%r15)
 sysc_do_restart:
 	larl    %r10,sys_call_table
 #ifdef CONFIG_S390_SUPPORT
@@ -542,6 +545,7 @@
 	clr	%r1,%r0
 	slag	%r7,%r1,2
 pgm_svcstd:
+	mvc	SP_ARGS(8,%r15),SP_R7(%r15)
 	larl    %r10,sys_call_table
 #ifdef CONFIG_S390_SUPPORT
         tm      SP_PSW+3(%r15),0x01  # are we running in 31 bit mode ?
diff -urN linux-2.6/arch/s390/kernel/ptrace.c linux-2.6-s390/arch/s390/kernel/ptrace.c
--- linux-2.6/arch/s390/kernel/ptrace.c	Mon May 10 04:33:19 2004
+++ linux-2.6-s390/arch/s390/kernel/ptrace.c	Mon Jun  7 16:07:53 2004
@@ -141,7 +141,7 @@
 		/*
 		 * psw and gprs are stored on the stack
 		 */
-		tmp = *(addr_t *)((addr_t) __KSTK_PTREGS(child) + addr);
+		tmp = *(addr_t *)((addr_t) &__KSTK_PTREGS(child)->psw + addr);
 		if (addr == (addr_t) &dummy->regs.psw.mask)
 			/* Remove per bit from user psw. */
 			tmp &= ~PSW_MASK_PER;
@@ -215,7 +215,7 @@
 			   high order bit but older gdb's rely on it */
 			data |= PSW_ADDR_AMODE;
 #endif
-		*(addr_t *)((addr_t) __KSTK_PTREGS(child) + addr) = data;
+		*(addr_t *)((addr_t) &__KSTK_PTREGS(child)->psw + addr) = data;
 
 	} else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
 		/*
@@ -360,7 +360,7 @@
 				PSW32_ADDR_AMODE31;
 		} else {
 			/* gpr 0-15 */
-			tmp = *(__u32 *)((addr_t) __KSTK_PTREGS(child) + 
+			tmp = *(__u32 *)((addr_t) &__KSTK_PTREGS(child)->psw +
 					 addr*2 + 4);
 		}
 	} else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
@@ -439,8 +439,8 @@
 				(__u64) tmp & PSW32_ADDR_INSN;
 		} else {
 			/* gpr 0-15 */
-			*(__u32*)((addr_t) __KSTK_PTREGS(child) + addr*2 + 4) =
-				tmp;
+			*(__u32*)((addr_t) &__KSTK_PTREGS(child)->psw
+				  + addr*2 + 4) = tmp;
 		}
 	} else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
 		/*
diff -urN linux-2.6/include/asm-s390/ptrace.h linux-2.6-s390/include/asm-s390/ptrace.h
--- linux-2.6/include/asm-s390/ptrace.h	Mon May 10 04:32:54 2004
+++ linux-2.6-s390/include/asm-s390/ptrace.h	Mon Jun  7 16:07:53 2004
@@ -303,6 +303,7 @@
  */
 struct pt_regs 
 {
+	unsigned long args[1];
 	psw_t psw;
 	unsigned long gprs[NUM_GPRS];
 	unsigned long orig_gpr2;

^ permalink raw reply	[flat|nested] 24+ messages in thread
[parent not found: <mailman.1086629984.12568.linux-kernel2news@redhat.com>]

end of thread, other threads:[~2004-06-11  8:35 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-05-20  9:38 [PATCH] Add FUTEX_CMP_REQUEUE futex op Jakub Jelinek
2004-05-20 22:52 ` Andrew Morton
2004-05-21  6:06   ` Ulrich Drepper
2004-05-21  6:36     ` Andrew Morton
2004-05-21  7:15       ` Ulrich Drepper
2004-05-21  7:43       ` Jakub Jelinek
2004-05-22 16:58         ` Arnd Bergmann
2004-05-24  7:34           ` Jakub Jelinek
2004-05-24  8:12             ` Andrew Morton
2004-05-24  8:19               ` Jakub Jelinek
2004-05-28 13:09                 ` DOCUMENTATION " bert hubert
2004-05-28 14:02                   ` Jakub Jelinek
2004-05-28 15:39                     ` bert hubert
2004-05-24  8:27               ` bert hubert
2004-05-24 17:34             ` Arnd Bergmann
2004-05-21  7:05   ` Ingo Molnar
2004-05-21 23:05 ` Andrew Morton
2004-05-22 10:10   ` Ingo Oeser
2004-05-23 17:33   ` Jakub Jelinek
2004-05-29  3:13 ` Rusty Russell
  -- strict thread matches above, loose matches on Subject: below --
2004-06-07 16:03 Martin Schwidefsky
     [not found] <mailman.1086629984.12568.linux-kernel2news@redhat.com>
2004-06-09 20:04 ` Pete Zaitcev
2004-06-09 22:08   ` Arnd Bergmann
2004-06-11  8:35   ` Martin Schwidefsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox