* [patch] 2.4.4 alpha semaphores optimization
@ 2001-05-03 15:47 Ivan Kokshaysky
2001-05-03 17:28 ` Andrea Arcangeli
` (3 more replies)
0 siblings, 4 replies; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-03 15:47 UTC (permalink / raw)
To: Richard Henderson; +Cc: linux-kernel
Initially I tried to use __builtin_expect in the rwsem.h, but found
that it doesn't help at all in the small inline functions - it works
as expected only in a reasonably large block of code. Converting these
functions into the macros won't help, as callers are inline
functions also.
On the other hand, gcc 3.0 generates quite a good code for
conditional branches (comparisons like value < 0, value == 0
predicted as false etc.). In the cases where expected value is 0,
we can use cmpeq instruction.
Other changes:
- added atomic_add_return_prev() for __down_write()
- removed some mb's for non-SMP
- removed non-inline up()/down_xx() when semaphore/waitqueue debugging
isn't enabled.
Ivan.
--- 2.4.4/include/asm-alpha/rwsem.h Sun Feb 7 06:28:16 2106
+++ linux/include/asm-alpha/rwsem.h Thu May 3 13:01:34 2001
@@ -0,0 +1,105 @@
+#ifndef _ALPHA_RWSEM_H
+#define _ALPHA_RWSEM_H
+
+/*
+ * Written by Ivan Kokshaysky <ink@jurassic.park.msu.ru>, 2001.
+ * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h
+ */
+
+#ifndef _LINUX_RWSEM_H
+#error please dont include asm/rwsem.h directly, use linux/rwsem.h instead
+#endif
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct rwsem_waiter;
+
+extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
+
+/*
+ * the semaphore definition
+ */
+struct rw_semaphore {
+ atomic_t count;
+#define RWSEM_UNLOCKED_VALUE 0x00000000
+#define RWSEM_ACTIVE_BIAS 0x00000001
+#define RWSEM_ACTIVE_MASK 0x0000ffff
+#define RWSEM_WAITING_BIAS (-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+ spinlock_t wait_lock;
+ struct list_head wait_list;
+#if RWSEM_DEBUG
+ int debug;
+#endif
+};
+
+#if RWSEM_DEBUG
+#define __RWSEM_DEBUG_INIT , 0
+#else
+#define __RWSEM_DEBUG_INIT /* */
+#endif
+
+#define __RWSEM_INITIALIZER(name) \
+ { ATOMIC_INIT(RWSEM_UNLOCKED_VALUE), SPIN_LOCK_UNLOCKED, \
+ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
+
+#define DECLARE_RWSEM(name) \
+ struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+static inline void init_rwsem(struct rw_semaphore *sem)
+{
+ sem->count.counter = RWSEM_UNLOCKED_VALUE;
+ spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+#if RWSEM_DEBUG
+ sem->debug = 0;
+#endif
+}
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ long count;
+ count = atomic_inc_return(&sem->count);
+ if (count < 0)
+ rwsem_down_read_failed(sem);
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ long prev, cmp;
+ prev = atomic_add_return_prev(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
+ __asm__ __volatile__("cmpeq %1,0,%0\n" : "=r" (cmp) : "r" (prev));
+ if (!cmp)
+ rwsem_down_write_failed(sem);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long count;
+ count = atomic_dec_return(&sem->count);
+ if (count < 0)
+ if ((count & RWSEM_ACTIVE_MASK) == 0)
+ rwsem_wake(sem);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long count, cmp;
+ count = atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
+ __asm__ __volatile__("cmpeq %1,0,%0\n" : "=r" (cmp) : "r" (count));
+ if (!cmp)
+ if ((count & RWSEM_ACTIVE_MASK) == 0)
+ rwsem_wake(sem);
+}
+
+#define rwsem_atomic_add(val, sem) atomic_add(val, &(sem)->count)
+#define rwsem_atomic_update(val, sem) atomic_add_return(val, &(sem)->count)
+
+#endif /* __KERNEL__ */
+#endif /* _ALPHA_RWSEM_H */
--- 2.4.4/include/asm-alpha/atomic.h Fri Apr 27 20:33:29 2001
+++ linux/include/asm-alpha/atomic.h Fri Apr 27 20:33:41 2001
@@ -70,7 +70,9 @@ static __inline__ long atomic_add_return
" addl %0,%3,%0\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
+#ifdef CONFIG_SMP
" mb\n"
+#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
@@ -88,11 +90,35 @@ static __inline__ long atomic_sub_return
" subl %0,%3,%0\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
+#ifdef CONFIG_SMP
" mb\n"
+#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (temp), "=m" (v->counter), "=&r" (result)
+ :"Ir" (i), "m" (v->counter) : "memory");
+ return result;
+}
+
+/*
+ * Same as above, but return the previous value
+ */
+static __inline__ long atomic_add_return_prev(int i, atomic_t * v)
+{
+ long temp, result;
+ __asm__ __volatile__(
+ "1: ldl_l %0,%1\n"
+ " addl %0,%3,%2\n"
+ " stl_c %2,%1\n"
+ " beq %2,2f\n"
+#ifdef CONFIG_SMP
+ " mb\n"
+#endif
+ ".subsection 2\n"
+ "2: br 1b\n"
+ ".previous"
+ :"=&r" (result), "=m" (v->counter), "=&r" (temp)
:"Ir" (i), "m" (v->counter) : "memory");
return result;
}
--- 2.4.4/include/asm-alpha/semaphore.h Fri Apr 27 20:33:29 2001
+++ linux/include/asm-alpha/semaphore.h Thu May 3 13:04:47 2001
@@ -11,7 +11,6 @@
#include <asm/current.h>
#include <asm/system.h>
#include <asm/atomic.h>
-#include <linux/compiler.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
@@ -92,14 +91,14 @@ extern void __up_wakeup(struct semaphore
static inline void __down(struct semaphore *sem)
{
long count = atomic_dec_return(&sem->count);
- if (__builtin_expect(count < 0, 0))
+ if (count < 0)
__down_failed(sem);
}
static inline int __down_interruptible(struct semaphore *sem)
{
long count = atomic_dec_return(&sem->count);
- if (__builtin_expect(count < 0, 0))
+ if (count < 0)
return __down_failed_interruptible(sem);
return 0;
}
@@ -201,7 +200,7 @@ static inline void __up(struct semaphore
: "m"(*sem), "r"(0x0000000100000000)
: "memory");
- if (__builtin_expect(ret <= 0, 0))
+ if (ret <= 0)
__up_wakeup(sem);
}
--- 2.4.4/arch/alpha/kernel/alpha_ksyms.c Fri Apr 27 20:33:29 2001
+++ linux/arch/alpha/kernel/alpha_ksyms.c Thu May 3 12:32:29 2001
@@ -169,14 +169,12 @@ EXPORT_SYMBOL(__strnlen_user);
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__up_wakeup);
+#if WAITQUEUE_DEBUG || DEBUG_SEMAPHORE
EXPORT_SYMBOL(down);
EXPORT_SYMBOL(down_interruptible);
EXPORT_SYMBOL(down_trylock);
EXPORT_SYMBOL(up);
-EXPORT_SYMBOL(down_read);
-EXPORT_SYMBOL(down_write);
-EXPORT_SYMBOL(up_read);
-EXPORT_SYMBOL(up_write);
+#endif
/*
* SMP-specific symbols.
--- 2.4.4/arch/alpha/kernel/semaphore.c Wed Apr 18 04:19:24 2001
+++ linux/arch/alpha/kernel/semaphore.c Thu May 3 14:41:29 2001
@@ -201,6 +201,8 @@ __up_wakeup(struct semaphore *sem)
wake_up(&sem->wait);
}
+#if WAITQUEUE_DEBUG || DEBUG_SEMAPHORE
+
void
down(struct semaphore *sem)
{
@@ -263,3 +265,5 @@ up(struct semaphore *sem)
#endif
__up(sem);
}
+
+#endif
--- 2.4.4/arch/alpha/config.in Fri Apr 27 20:33:29 2001
+++ linux/arch/alpha/config.in Fri Apr 27 20:33:41 2001
@@ -5,8 +5,8 @@
define_bool CONFIG_ALPHA y
define_bool CONFIG_UID16 n
-define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y
-define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM n
+define_bool CONFIG_RWSEM_GENERIC_SPINLOCK n
+define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y
mainmenu_name "Kernel configuration of Linux for Alpha machines"
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-03 15:47 [patch] 2.4.4 alpha semaphores optimization Ivan Kokshaysky
@ 2001-05-03 17:28 ` Andrea Arcangeli
2001-05-04 9:15 ` Ivan Kokshaysky
2001-05-04 9:22 ` David Howells
` (2 subsequent siblings)
3 siblings, 1 reply; 15+ messages in thread
From: Andrea Arcangeli @ 2001-05-03 17:28 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel
On Thu, May 03, 2001 at 07:47:47PM +0400, Ivan Kokshaysky wrote:
> Initially I tried to use __builtin_expect in the rwsem.h, but found
> that it doesn't help at all in the small inline functions - it works
> as expected only in a reasonably large block of code. Converting these
> functions into the macros won't help, as callers are inline
> functions also.
> On the other hand, gcc 3.0 generates quite a good code for
> conditional branches (comparisons like value < 0, value == 0
> predicted as false etc.). In the cases where expected value is 0,
> we can use cmpeq instruction.
> Other changes:
> - added atomic_add_return_prev() for __down_write()
> - removed some mb's for non-SMP
> - removed non-inline up()/down_xx() when semaphore/waitqueue debugging
> isn't enabled.
I'd love if you could port it on top of this one and to fix it so that
it can handle up to 2^32 sleepers and not only 2^16 like we have to do
on the 32bit archs to get good performance:
ftp://ftp.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.4aa3/00_rwsem-11
I just wrote the prototype; it only needs to be implemented — see
linux/include/asm-alpha/rwsem_xchgadd.h:
--------------------------------------------------------------
#ifndef _ALPHA_RWSEM_XCHGADD_H
#define _ALPHA_RWSEM_XCHGADD_H
/* WRITEME */
static inline void __down_read(struct rw_semaphore *sem)
{
}
static inline void __down_write(struct rw_semaphore *sem)
{
}
static inline void __up_read(struct rw_semaphore *sem)
{
}
static inline void __up_write(struct rw_semaphore *sem)
{
}
static inline long rwsem_xchgadd(long value, long * count)
{
return value;
}
#endif
--------------------------------------------------------------
You only need to fill the above 5 inlined fast paths to make it working
and that's the only thing in the whole alpha tree about the rwsem.
The above patch also provides the fastest write fast path for x86 archs
and the fastest spinlock-based rwsem. I haven't re-benchmarked the
whole thing yet, but my up_write definitely has to be faster than
the one in 2.4.4 vanilla and the other fast paths have to be the same
speed.
Andrea
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-03 17:28 ` Andrea Arcangeli
@ 2001-05-04 9:15 ` Ivan Kokshaysky
2001-05-04 14:33 ` Andrea Arcangeli
0 siblings, 1 reply; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-04 9:15 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Richard Henderson, linux-kernel
On Thu, May 03, 2001 at 07:28:48PM +0200, Andrea Arcangeli wrote:
> I'd love if you could port it on top of this one and to fix it so that
> it can handle up to 2^32 sleepers and not only 2^16 like we have to do
> on the 32bit archs to get good performance:
>
> ftp://ftp.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.4aa3/00_rwsem-11
It could be done without much pain for both "official" and your rwsem
implementations.
However, there are 3 reasons why I prefer 16-bit counters:
a. "max user processes" ulimit is much lower than 64K anyway;
b. "long" count would cost extra 8 bytes in the struct rw_semaphore;
c. I can use existing atomic routines which deal with ints.
Actually I'm more anxious about a __builtin_expect() problem,
and I'd like to hear Richard's comment on this...
Ivan.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 9:15 ` Ivan Kokshaysky
@ 2001-05-04 14:33 ` Andrea Arcangeli
2001-05-04 17:02 ` Ivan Kokshaysky
0 siblings, 1 reply; 15+ messages in thread
From: Andrea Arcangeli @ 2001-05-04 14:33 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel
On Fri, May 04, 2001 at 01:15:28PM +0400, Ivan Kokshaysky wrote:
> However, there are 3 reasons why I prefer 16-bit counters:
I assume you mean 32bit counter. (that gives max 2^16 sleepers)
> a. "max user processes" ulimit is much lower than 64K anyway;
the 2^16 limit is not a per-user limit it is a global one so the max
user process ulimit is irrelevant.
Only the number of pid and the max number of tasks supported by the
architecture is a relevant limit for this.
> b. "long" count would cost extra 8 bytes in the struct rw_semaphore;
correct but that's the "feature" to be able to support 2^32 concurrent
sleepers at not relevant runtime cost 8).
> c. I can use existing atomic routines which deal with ints.
I was thinking at a dedicated routine that implements the slow path by
hand as well like x86 just do. Then using ldq instead of ldl isn't
really a big deal programmer wise.
Andrea
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 14:33 ` Andrea Arcangeli
@ 2001-05-04 17:02 ` Ivan Kokshaysky
2001-05-04 17:16 ` Andrea Arcangeli
0 siblings, 1 reply; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-04 17:02 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: linux-kernel
[-- Attachment #1: Type: text/plain, Size: 1195 bytes --]
On Fri, May 04, 2001 at 04:33:59PM +0200, Andrea Arcangeli wrote:
> the 2^16 limit is not a per-user limit it is a global one so the max
> user process ulimit is irrelevant.
>
> Only the number of pid and the max number of tasks supported by the
> architecture is a relevant limit for this.
Thanks for the correction. I thought about a case where one user could
exhaust 2^16 limit.
> > b. "long" count would cost extra 8 bytes in the struct rw_semaphore;
>
> correct but that's the "feature" to be able to support 2^32 concurrent
> sleepers at not relevant runtime cost 8).
But I can't imagine how this "feature" could be useful in a real life :-)
> > c. I can use existing atomic routines which deal with ints.
>
> I was thinking at a dedicated routine that implements the slow path by
> hand as well like x86 just do. Then using ldq instead of ldl isn't
> really a big deal programmer wise.
You meant "the fast path", I guess? Then it's true. However with those
atomic functions code is much more readable.
Anyway, I've attached asm-alpha/rwsem_xchgadd.h for your implementation.
However I got processes in D state early on boot with it -- maybe
I've made a typo somewhere...
Ivan.
[-- Attachment #2: rwsem_xchgadd.h --]
[-- Type: text/plain, Size: 2177 bytes --]
#ifndef _ALPHA_RWSEM_XCHGADD_H
#define _ALPHA_RWSEM_XCHGADD_H
#include <asm/types.h> /* BITS_PER_LONG */
static inline void __down_read(struct rw_semaphore *sem)
{
long count, temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,1,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (count), "=m" (sem->count), "=&r" (temp)
:"m" (sem->count) : "memory");
if (count < 0)
rwsem_down_failed(sem, RWSEM_READ_BLOCKING_BIAS);
}
static inline void __down_write(struct rw_semaphore *sem)
{
long granted, temp = RWSEM_WRITE_BIAS + RWSEM_READ_BIAS;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%2,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
#endif
" cmpeq %0,0,%0\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (granted), "=m" (sem->count), "=&r" (temp)
:"2" (temp),"m" (sem->count) : "memory");
if (!granted)
rwsem_down_failed(sem, RWSEM_WRITE_BLOCKING_BIAS);
}
static inline void __up_read(struct rw_semaphore *sem)
{
long oldcount, temp;
__asm__ __volatile__(
#ifdef CONFIG_SMP
" mb\n"
#endif
"1: ldq_l %0,%1\n"
" subq %0,1,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
" subl %0,1,%2\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
:"m" (sem->count) : "memory");
if (oldcount < 0 && temp == 0)
rwsem_wake(sem);
}
static inline void __up_write(struct rw_semaphore *sem)
{
long count, temp = RWSEM_READ_BIAS + RWSEM_WRITE_BIAS;
__asm__ __volatile__(
#ifdef CONFIG_SMP
" mb\n"
#endif
"1: ldq_l %0,%1\n"
" subq %0,%2,%2\n"
" mov %2,%0\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (count), "=m" (sem->count), "=&r" (temp)
:"2" (temp), "m" (sem->count) : "memory");
if (count < 0)
rwsem_wake(sem);
}
/*
 * Atomically add 'value' to '*count' and return the value the counter
 * held *before* the add (fetch-and-add via an Alpha ll/sc sequence).
 *
 * Fix: the original constrained the memory operands as "=m" (count) /
 * "m" (count).  That makes %1 refer to the stack slot holding the
 * local pointer itself, so the ldq_l/stq_c pair atomically updated the
 * pointer variable instead of the semaphore counter it points at.  The
 * operands must be the pointee, *count.
 */
static inline long rwsem_xchgadd(long value, long * count)
{
	long ret, temp;
	__asm__ __volatile__(
	"1:	ldq_l	%0,%1\n"	/* ret = load-locked counter */
	"	addq	%0,%3,%2\n"	/* temp = ret + value */
	"	stq_c	%2,%1\n"	/* temp = 1 on success, 0 on contention */
	"	beq	%2,2f\n"	/* retry out of line if sc failed */
	".subsection 2\n"
	"2:	br	1b\n"
	".previous"
	:"=&r" (ret), "=m" (*count), "=&r" (temp)
	:"Ir" (value), "m" (*count) : "memory");
	return ret;
}
#endif
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 17:02 ` Ivan Kokshaysky
@ 2001-05-04 17:16 ` Andrea Arcangeli
0 siblings, 0 replies; 15+ messages in thread
From: Andrea Arcangeli @ 2001-05-04 17:16 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: linux-kernel
On Fri, May 04, 2001 at 09:02:33PM +0400, Ivan Kokshaysky wrote:
> But I can't imagine how this "feature" could be useful in a real life :-)
It will be required by the time we can fork more than 2^16 tasks (which
I'm wondering if it could be just the case if you use CLONE_PID as
root, I didn't checked the code yet to be sure).
> You meant "the fast path", I guess? Then it's true. However with those
Yes, I guess the slow path is quite painful to maintain, however I'd add
at least the __builtin_expect() so it gets optimized by 2.96 and 3.[01].
> atomic functions code is much more readable.
Your attached code is nice enough IMHO ;).
> Anyway, I've attached asm-alpha/rwsem_xchgadd.h for your implementation.
Sweet, thanks.
> However I got processes in D state early on boot with it -- maybe
> I've made a typo somewhere...
It has to be a bug in a non contention case then, or maybe you run some
threaded app during boot? Note that my version is a bit different than
David's one, my fast path has less requirements in up_write and so it
can be implemented with less instructions. I will check and integrate
your code soon into my patch, thanks. If you find the bug meanwhile let
me know (to beat it hard you can use my userspace threaded app that
faults and mmap/munmap in loop from dozen of threads).
Andrea
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-03 15:47 [patch] 2.4.4 alpha semaphores optimization Ivan Kokshaysky
2001-05-03 17:28 ` Andrea Arcangeli
@ 2001-05-04 9:22 ` David Howells
2001-05-04 9:54 ` Ivan Kokshaysky
2001-05-04 16:46 ` Ivan Kokshaysky
2001-05-04 21:12 ` Richard Henderson
2001-05-04 21:13 ` Richard Henderson
3 siblings, 2 replies; 15+ messages in thread
From: David Howells @ 2001-05-04 9:22 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: linux-kernel
Hello Ivan,
One reason I picked "signed long" as the count type in the lib/rwsem.c is that
this would be 64 bits on a 64-bit arch such as the alpha.
So I've taken your idea for include/asm-alpha/rwsem.h and modified it a
little. You'll find it attached at the bottom.
I don't know whether it will (a) compile, or (b) work... I don't have an alpha
to play with.
I also don't know the alpha function calling convention, so I can't put direct
calls to the fallback routines in lib/rwsem.c from the ".subsection 2"
bits. Can you do that, or can you tell me how the calling convention works?
Cheers,
David
===============================================================================
#ifndef _ALPHA_RWSEM_H
#define _ALPHA_RWSEM_H
/*
* Written by Ivan Kokshaysky <ink@jurassic.park.msu.ru>, 2001.
* Based on asm-alpha/semaphore.h and asm-i386/rwsem.h
*/
#ifndef _LINUX_RWSEM_H
#error please dont include asm/rwsem.h directly, use linux/rwsem.h instead
#endif
#ifdef __KERNEL__
#include <linux/list.h>
#include <linux/spinlock.h>
struct rwsem_waiter;
extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
/*
* the semaphore definition
*/
struct rw_semaphore {
signed long count;
#define RWSEM_UNLOCKED_VALUE 0x0000000000000000
#define RWSEM_ACTIVE_BIAS 0x0000000000000001
#define RWSEM_ACTIVE_MASK 0x00000000ffffffff
#define RWSEM_WAITING_BIAS (-0x0000000100000000)
#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
spinlock_t wait_lock;
struct list_head wait_list;
#if RWSEM_DEBUG
int debug;
#endif
};
#if RWSEM_DEBUG
#define __RWSEM_DEBUG_INIT , 0
#else
#define __RWSEM_DEBUG_INIT /* */
#endif
#define __RWSEM_INITIALIZER(name) \
{ ATOMIC_INIT(RWSEM_UNLOCKED_VALUE), SPIN_LOCK_UNLOCKED, \
LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
static inline void init_rwsem(struct rw_semaphore *sem)
{
sem->count = RWSEM_UNLOCKED_VALUE;
spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#if RWSEM_DEBUG
sem->debug = 0;
#endif
}
static inline void __down_read(struct rw_semaphore *sem)
{
signed long oldcount, temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory");
if (oldcount < 0)
rwsem_down_read_failed(sem);
}
static inline void __down_write(struct rw_semaphore *sem)
{
signed long granted, temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
" cmpeq %0,0,%0\n"
#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (granted), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory");
if (!granted)
rwsem_down_write_failed(sem);
}
/*
 * Release a read lock: atomically subtract the reader bias from
 * sem->count with an ll/sc sequence.  The asm leaves the value the
 * counter had *before* the decrement in 'oldcount'.
 *
 * A negative result means there are queued waiters; if additionally
 * the post-decrement count has no active owners left, wake them.
 *
 * Fix: the original wakeup test read an undeclared variable 'count'
 * (compile error).  Following the i386 rwsem semantics, the active
 * mask must be tested on the new value, i.e.
 * oldcount - RWSEM_ACTIVE_READ_BIAS.
 */
static inline void __up_read(struct rw_semaphore *sem)
{
	signed long oldcount, temp;
	__asm__ __volatile__(
	"1:	ldq_l	%0,%1\n"
	"	subq	%0,%3,%2\n"
	"	stq_c	%2,%1\n"
	"	beq	%2,2f\n"
#ifdef CONFIG_SMP
	"	mb\n"
#endif
	".subsection 2\n"
	"2:	br	1b\n"
	".previous"
	:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
	:"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory");
	if (oldcount < 0)
		if (((oldcount - RWSEM_ACTIVE_READ_BIAS) & RWSEM_ACTIVE_MASK) == 0)
			rwsem_wake(sem);
}
static inline void __up_write(struct rw_semaphore *sem)
{
signed long count, cmp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" subq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
" cmpeq %0,%3,%2\n"
" subq %0,%3,%0\n"
#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (count), "=m" (sem->count), "=&r" (cmp)
:"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory");
if (!cmp)
if ((count & RWSEM_ACTIVE_MASK) == 0)
rwsem_wake(sem);
}
#define rwsem_atomic_add(val, sem) atomic_add(val, &(sem)->count)
#define rwsem_atomic_update(val, sem) atomic_add_return(val, &(sem)->count)
#endif /* __KERNEL__ */
#endif /* _ALPHA_RWSEM_H */
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 9:22 ` David Howells
@ 2001-05-04 9:54 ` Ivan Kokshaysky
2001-05-04 16:46 ` Ivan Kokshaysky
1 sibling, 0 replies; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-04 9:54 UTC (permalink / raw)
To: David Howells; +Cc: linux-kernel
On Fri, May 04, 2001 at 10:22:53AM +0100, David Howells wrote:
> Hello Ivan,
Hello David!
> I don't know whether it will (a) compile, or (b) work... I don't have an alpha
> to play with.
It looks ok at a first glance, I can try it today.
> I also don't know the alpha function calling convention, so I can't put direct
> calls to the fallback routines in lib/rwsem.c from the ".subsection 2"
> bits. Can you do that, or can you tell me how the calling convention works?
Calling C routines from inline asm is quite painful on alpha. Lots of
registers will be clobbered, so you need some wrapper functions preserving
them. It was done in 2.2 this way, but that code was hardly maintainable...
Ivan.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 9:22 ` David Howells
2001-05-04 9:54 ` Ivan Kokshaysky
@ 2001-05-04 16:46 ` Ivan Kokshaysky
1 sibling, 0 replies; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-04 16:46 UTC (permalink / raw)
To: David Howells; +Cc: linux-kernel
[-- Attachment #1: Type: text/plain, Size: 765 bytes --]
On Fri, May 04, 2001 at 10:22:53AM +0100, David Howells wrote:
> I don't know whether it will (a) compile, or (b) work... I don't have an alpha
> to play with.
Neither (a) nor (b) ;-) Corrected asm-alpha/rwsem.h attached.
Also small fix for lib/rwsem.c -- RWSEM_WAITING_BIAS-RWSEM_ACTIVE_BIAS
won't fit in the __s32 if counters are 64-bit.
--- linux/lib/rwsem.c.orig Sat Apr 28 00:58:28 2001
+++ linux/lib/rwsem.c Fri May 4 17:38:06 2001
@@ -112,7 +112,7 @@ static inline struct rw_semaphore *__rws
*/
static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore *sem,
struct rwsem_waiter *waiter,
- __s32 adjustment)
+ signed long adjustment)
{
struct task_struct *tsk = current;
signed long count;
Ivan.
[-- Attachment #2: rwsem.h --]
[-- Type: text/plain, Size: 4105 bytes --]
#ifndef _ALPHA_RWSEM_H
#define _ALPHA_RWSEM_H
/*
* Written by Ivan Kokshaysky <ink@jurassic.park.msu.ru>, 2001.
* Based on asm-alpha/semaphore.h and asm-i386/rwsem.h
*/
#ifndef _LINUX_RWSEM_H
#error please dont include asm/rwsem.h directly, use linux/rwsem.h instead
#endif
#ifdef __KERNEL__
#include <linux/list.h>
#include <linux/spinlock.h>
struct rwsem_waiter;
extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
/*
* the semaphore definition
*/
struct rw_semaphore {
long count;
#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L
#define RWSEM_ACTIVE_BIAS 0x0000000000000001L
#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL
#define RWSEM_WAITING_BIAS (-0x0000000100000000L)
#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
spinlock_t wait_lock;
struct list_head wait_list;
#if RWSEM_DEBUG
int debug;
#endif
};
#if RWSEM_DEBUG
#define __RWSEM_DEBUG_INIT , 0
#else
#define __RWSEM_DEBUG_INIT /* */
#endif
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
static inline void init_rwsem(struct rw_semaphore *sem)
{
sem->count = RWSEM_UNLOCKED_VALUE;
spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#if RWSEM_DEBUG
sem->debug = 0;
#endif
}
static inline void __down_read(struct rw_semaphore *sem)
{
long oldcount, temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
#endif
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory");
if (oldcount < 0)
rwsem_down_read_failed(sem);
}
static inline void __down_write(struct rw_semaphore *sem)
{
long granted, temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
#ifdef CONFIG_SMP
" mb\n"
#endif
" cmpeq %0,0,%0\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (granted), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory");
if (!granted)
rwsem_down_write_failed(sem);
}
static inline void __up_read(struct rw_semaphore *sem)
{
long oldcount, temp;
__asm__ __volatile__(
#ifdef CONFIG_SMP
" mb\n"
#endif
"1: ldq_l %0,%1\n"
" subq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory");
if (oldcount < 0)
if ((oldcount & RWSEM_ACTIVE_MASK) == 0)
rwsem_wake(sem);
}
/*
 * Release the write lock: atomically subtract RWSEM_ACTIVE_WRITE_BIAS
 * from sem->count via an ll/sc sequence.
 *
 * 'count' receives the pre-subtract counter value; 'cmp' is first the
 * stq_c success flag, then (after the sc has succeeded and the beq
 * falls through) the result of cmpeq: nonzero iff the old count was
 * exactly RWSEM_ACTIVE_WRITE_BIAS, i.e. one writer and no waiters.
 * The retry branch (2: br 1b) skips the cmpeq, so cmp is only the
 * comparison result on the success path.
 */
static inline void __up_write(struct rw_semaphore *sem)
{
long count, cmp;
__asm__ __volatile__(
#ifdef CONFIG_SMP
" mb\n"
#endif
"1: ldq_l %0,%1\n"
" subq %0,%3,%2\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
" cmpeq %0,%3,%2\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (count), "=m" (sem->count), "=&r" (cmp)
:"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory");
/* cmp == 0: someone else touched the count while we held the lock;
 * wake sleepers if the low 32 bits show we were the only active
 * owner (old active count == RWSEM_ACTIVE_BIAS).
 * NOTE(review): correctness of this condition depends on the bias
 * layout defined earlier in this header — verify against lib/rwsem.c. */
if (!cmp)
if ((int)count - RWSEM_ACTIVE_BIAS == 0)
rwsem_wake(sem);
}
/*
 * Atomically add 'val' to sem->count; no return value needed, so a
 * single register (temp) doubles as the sum and the stq_c success
 * flag (stq_c writes 0 into it on contention, triggering the retry).
 */
static inline void rwsem_atomic_add(long val, struct rw_semaphore *sem)
{
long temp;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%2,%0\n"
" stq_c %0,%1\n"
" beq %0,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (temp), "=m" (sem->count)
:"Ir" (val), "m" (sem->count));
}
/*
 * Atomically add 'val' to sem->count and return the *updated*
 * (post-add) value.  The sum is computed twice into separate
 * registers because stq_c clobbers its source operand with the
 * success flag: 'temp' is consumed by the store-conditional while
 * 'ret' survives to be returned.
 */
static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
{
long temp, ret;
__asm__ __volatile__(
"1: ldq_l %0,%1\n"
" addq %0,%3,%2\n"
" addq %0,%3,%0\n"
" stq_c %2,%1\n"
" beq %2,2f\n"
".subsection 2\n"
"2: br 1b\n"
".previous"
:"=&r" (ret), "=m" (sem->count), "=&r" (temp)
:"Ir" (val), "m" (sem->count));
return ret;
}
#endif /* __KERNEL__ */
#endif /* _ALPHA_RWSEM_H */
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-03 15:47 [patch] 2.4.4 alpha semaphores optimization Ivan Kokshaysky
2001-05-03 17:28 ` Andrea Arcangeli
2001-05-04 9:22 ` David Howells
@ 2001-05-04 21:12 ` Richard Henderson
2001-05-05 13:55 ` Ivan Kokshaysky
2001-05-04 21:13 ` Richard Henderson
3 siblings, 1 reply; 15+ messages in thread
From: Richard Henderson @ 2001-05-04 21:12 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: linux-kernel
On Thu, May 03, 2001 at 07:47:47PM +0400, Ivan Kokshaysky wrote:
> - removed some mb's for non-SMP
This isn't correct. Either you need atomic updates or you don't.
If you don't, then you shouldn't be using ll/sc at all. If you do
(perhaps to coordinate with devices) then the barriers are required.
> - removed non-inline up()/down_xx() when semaphore/waitqueue debugging
> isn't enabled.
They should still be exported for module compatibility.
r~
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 21:12 ` Richard Henderson
@ 2001-05-05 13:55 ` Ivan Kokshaysky
2001-05-06 6:55 ` Ivan Kokshaysky
0 siblings, 1 reply; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-05 13:55 UTC (permalink / raw)
To: Richard Henderson; +Cc: linux-kernel
On Fri, May 04, 2001 at 02:12:40PM -0700, Richard Henderson wrote:
> > - removed some mb's for non-SMP
>
> This isn't correct. Either you need atomic updates or you don't.
> If you don't, then you shouldn't be using ll/sc at all.
I don't think so. On a single CPU system we need atomic updates
to protect modifying of critical variables from interrupts, and
ll/sc sequences guarantee exactly that. But on UP system we don't
need memory barriers of any kind (I mean system memory space accesses,
not IO, of course), as we don't care about read/write ordering at all.
> If you do
> (perhaps to coordinate with devices) then the barriers are required.
For IO space access mb's are required, but ll/sc are of no use, AFAIK.
However, if I understand correctly, the r/w semaphores can't be used from
interrupt context, so in this case I'd agree -- this stuff could be made
completely non-atomic for UP.
> > - removed non-inline up()/down_xx() when semaphore/waitqueue debugging
> > isn't enabled.
>
> They should still be exported for module compatibility.
If you mean some external modules, then ok. The modules built from the
main tree shouldn't have any problems...
Ivan.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-05 13:55 ` Ivan Kokshaysky
@ 2001-05-06 6:55 ` Ivan Kokshaysky
0 siblings, 0 replies; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-06 6:55 UTC (permalink / raw)
To: Richard Henderson; +Cc: linux-kernel
> > If you do
> > (perhaps to coordinate with devices) then the barriers are required.
>
> For IO space access mb's are required, but ll/sc are of no use, AFAIK.
Ugh. You are right, of course. I forgot that drivers are also using
atomic.h, and the intelligent device could be counted as another CPU
to some degree...
Thanks for the __builtin_expect fix!
Ivan.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-03 15:47 [patch] 2.4.4 alpha semaphores optimization Ivan Kokshaysky
` (2 preceding siblings ...)
2001-05-04 21:12 ` Richard Henderson
@ 2001-05-04 21:13 ` Richard Henderson
2001-05-05 14:17 ` Ivan Kokshaysky
3 siblings, 1 reply; 15+ messages in thread
From: Richard Henderson @ 2001-05-04 21:13 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: linux-kernel
On Thu, May 03, 2001 at 07:47:47PM +0400, Ivan Kokshaysky wrote:
> Initially I tried to use __builtin_expect in the rwsem.h, but found
> that it doesn't help at all in the small inline functions - it works
> as expected only in a reasonably large block of code.
Eh? Would you give me an example that isn't working properly?
r~
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch] 2.4.4 alpha semaphores optimization
2001-05-04 21:13 ` Richard Henderson
@ 2001-05-05 14:17 ` Ivan Kokshaysky
2001-05-05 17:06 ` __builtin_expect vs inlining Richard Henderson
0 siblings, 1 reply; 15+ messages in thread
From: Ivan Kokshaysky @ 2001-05-05 14:17 UTC (permalink / raw)
To: Richard Henderson; +Cc: linux-kernel
On Fri, May 04, 2001 at 02:13:18PM -0700, Richard Henderson wrote:
> Eh? Would you give me an example that isn't working properly?
Sure.
bar.c:
-----------------
extern void rarely_executed_code(void);
static inline void foo_no_be(void)
{
int ret;
__asm__ __volatile__("nop\n": "=r" (ret));
if (ret < 0)
rarely_executed_code();
}
static inline void foo(void)
{
int ret;
__asm__ __volatile__("unop\n": "=r" (ret));
if (__builtin_expect(ret < 0, 0))
rarely_executed_code();
}
#define foo_macro() ({ \
int ret; \
__asm__ __volatile__("fnop\n": "=r" (ret)); \
if (__builtin_expect(ret < 0, 0)) \
rearly_executed_code(); \
})
void bar(void)
{
foo_no_be();
foo();
foo_macro();
}
---------------
bar.s, compiled with 'gcc -O2 -S bar.c':
--------------
[...]
$bar..ng:
lda $30,-16($30)
stq $26,0($30)
.prologue 1
nop
.align 3 #realign
addl $1,$31,$1
blt $1,$L12 # (ret < 0) predicted to be "false", and gcc
# put the code out of line quite nicely.
$L8:
unop
.align 3 #realign
srl $1,31,$1
blbc $1,$L10 # Oops. The slow path code is in line...
jsr $26,rarely_executed_code
ldgp $29,0($26)
$L10:
fnop
.align 3 #realign
srl $1,31,$1
blbs $1,$L13 # This works.
$L11:
ldq $26,0($30)
nop
lda $30,16($30)
ret $31,($26),1
.align 4
$L13:
jsr $26,rearly_executed_code
ldgp $29,0($26)
br $31,$L11
.align 4
$L12:
jsr $26,rarely_executed_code
ldgp $29,0($26)
br $31,$L8
.end bar
.ident "GCC: (GNU) 3.0 20010430 (prerelease)"
---------------
So one of the questions: can one rely on current branch prediction
algorithms (val < 0, val == 0 false etc.) in the long term?
Ivan.
^ permalink raw reply [flat|nested] 15+ messages in thread
* __builtin_expect vs inlining
2001-05-05 14:17 ` Ivan Kokshaysky
@ 2001-05-05 17:06 ` Richard Henderson
0 siblings, 0 replies; 15+ messages in thread
From: Richard Henderson @ 2001-05-05 17:06 UTC (permalink / raw)
To: Ivan Kokshaysky; +Cc: linux-kernel, gcc-patches
On Sat, May 05, 2001 at 06:17:18PM +0400, Ivan Kokshaysky wrote:
> > Eh? Would you give me an example that isn't working properly?
>
> Sure.
Fixed thus.
> So one of the questions: can one rely on current branch prediction
> algorithms (val < 0, val == 0 false etc.) in the long term?
Err, no. We reserve the right to tweak the predictions, or to replace
them with different heuristics. I'd hope they'd be _generally_ better
heuristics, though the effect on any one particular test might change.
r~
* integrate.c (copy_insn_list): Substitute NOTE_EXPECTED_VALUE.
Index: integrate.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/integrate.c,v
retrieving revision 1.142
diff -c -p -r1.142 integrate.c
*** integrate.c 2001/05/03 16:14:34 1.142
--- integrate.c 2001/05/05 16:54:24
*************** copy_insn_list (insns, map, static_chain
*** 1536,1541 ****
--- 1536,1546 ----
else
NOTE_BLOCK (copy) = *mapped_block_p;
}
+ else if (copy
+ && NOTE_LINE_NUMBER (copy) == NOTE_INSN_EXPECTED_VALUE)
+ NOTE_EXPECTED_VALUE (copy)
+ = copy_rtx_and_substitute (NOTE_EXPECTED_VALUE (insn),
+ map, 0);
}
else
copy = 0;
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2001-05-06 6:56 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2001-05-03 15:47 [patch] 2.4.4 alpha semaphores optimization Ivan Kokshaysky
2001-05-03 17:28 ` Andrea Arcangeli
2001-05-04 9:15 ` Ivan Kokshaysky
2001-05-04 14:33 ` Andrea Arcangeli
2001-05-04 17:02 ` Ivan Kokshaysky
2001-05-04 17:16 ` Andrea Arcangeli
2001-05-04 9:22 ` David Howells
2001-05-04 9:54 ` Ivan Kokshaysky
2001-05-04 16:46 ` Ivan Kokshaysky
2001-05-04 21:12 ` Richard Henderson
2001-05-05 13:55 ` Ivan Kokshaysky
2001-05-06 6:55 ` Ivan Kokshaysky
2001-05-04 21:13 ` Richard Henderson
2001-05-05 14:17 ` Ivan Kokshaysky
2001-05-05 17:06 ` __builtin_expect vs inlining Richard Henderson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox