* [PATCH 00/12] rwsem changes + down_read_unfair() proposal
@ 2010-05-12  3:20 Michel Lespinasse
  2010-05-12  3:20 ` [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code Michel Lespinasse
                   ` (11 more replies)
  0 siblings, 12 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Hi,

I would like to solicit comments regarding the following changes
against 2.6.34-rc7. The motivation for this change was some cluster
monitoring software we use at Google, which reads /proc/<pid>/maps files
for all running processes. When the machines are under load, the mmap_sem
is often acquired for read for long periods of time, since do_page_fault()
holds it while doing disk accesses; the fair queueing behavior then often
leaves the monitoring software making little progress. By introducing unfair
behavior in a few selected places, we are able to let the monitoring
software make progress without impacting performance for the rest of
the system.

In general, I've made sure to implement this proposal without touching the
rwsem fast paths. Also, the first 8 patches of this series should be
generally applicable even if the down_read_unfair() changes are not taken;
they address minor issues such as situations where reader threads can get
blocked at the head of the waiting list even though the rwsem is currently
owned for reads.
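
For reference, the caller-visible API proposed in patches 9-11 boils down
to one extra entry point next to the existing down_read() family (rough
sketch only; on architectures that do not define __HAVE_DOWN_READ_UNFAIR,
down_read_unfair() simply falls back to plain down_read()):

	/* existing */
	void down_read(struct rw_semaphore *sem);
	int  down_read_trylock(struct rw_semaphore *sem);
	void up_read(struct rw_semaphore *sem);

	/* proposed: like down_read(), but may jump ahead of queued writers */
	void down_read_unfair(struct rw_semaphore *sem);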

Michel Lespinasse (12):
  rwsem: test for no active locks in __rwsem_do_wake undo code
  rwsem: use single atomic update for sem count when waking up readers
  rwsem: let RWSEM_WAITING_BIAS represent any number of waiting threads
  rwsem: consistently use adjustment variable
  x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics
  rwsem: wake queued readers when other readers are active
  rwsem: wake queued readers when writer blocks on active read lock
  rwsem: smaller wrappers around rwsem_down_failed_common
  generic rwsem: implement down_read_unfair
  rwsem: down_read_unfair infrastructure support
  x86 rwsem: down_read_unfair implementation
  Use down_read_unfair() for /proc/<pid>/exe and /proc/<pid>/maps files

 arch/x86/include/asm/rwsem.h   |   66 ++++++++++++++-----
 arch/x86/lib/rwsem_64.S        |   14 +++-
 arch/x86/lib/semaphore_32.S    |   21 +++++-
 fs/proc/base.c                 |    2 +-
 fs/proc/task_mmu.c             |    2 +-
 fs/proc/task_nommu.c           |    2 +-
 include/linux/rwsem-spinlock.h |   10 +++-
 include/linux/rwsem.h          |   10 +++
 kernel/rwsem.c                 |   17 +++++
 lib/rwsem-spinlock.c           |   10 ++-
 lib/rwsem.c                    |  145 ++++++++++++++++++++++++---------------
 11 files changed, 213 insertions(+), 86 deletions(-)


Also included below is the down_read_unfair.c test mentioned in the
last patch of the series:

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

struct thread_data {
	unsigned long count_A, count_B, count_C;
	int done;
	int pid;
	char *mapped;
	unsigned long mapped_size;
	int dummy;
	int barrier_count;
	pthread_mutex_t barrier_mutex;
	pthread_cond_t barrier_cond;
};

static void barrier(struct thread_data *data)
{
	pthread_mutex_lock(&data->barrier_mutex);
	if (--(data->barrier_count) == 0)
		pthread_cond_broadcast(&data->barrier_cond);
	while (data->barrier_count)
		pthread_cond_wait(&data->barrier_cond, &data->barrier_mutex);
	pthread_mutex_unlock(&data->barrier_mutex);
}

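/*
 * Thread A: touch random pages of the mapped file, taking page faults
 * that hold the mmap_sem for read while the data is fetched from disk.
 */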
static void *func_A(void *tmp)
{
	int dummy = 0;
	struct thread_data *data = tmp;

	srand48(time(NULL));
	barrier(data);

	while (!data->done) {
		unsigned long pos = (lrand48() << 31) | lrand48();
		dummy += data->mapped[pos % data->mapped_size];
		data->count_A++;
	}

	data->dummy = dummy;
	return NULL;
}

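/*
 * Thread B: repeatedly mmap and munmap a page, each call taking the
 * mmap_sem for write.
 */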
static void *func_B(void *tmp)
{
	char *dummy;
	struct thread_data *data = tmp;

	barrier(data);

	while (!data->done) {
		dummy = mmap(NULL, 1, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS,
			     -1, 0);
		if (dummy == MAP_FAILED || munmap(dummy, 1)) break;
		data->count_B++;
	}

	return NULL;
}

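/*
 * Thread C: repeatedly read this process's /proc/<pid>/maps, which
 * takes the mmap_sem for read (unfairly, with this patch series).
 */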
static void *func_C(void *tmp)
{
	char procfile[64];
	struct thread_data *data = tmp;
	int fd, size;
	char buf[4096];

	size = sprintf(procfile, "/proc/%d/maps", data->pid);
	barrier(data);
	if (size < 0) return NULL;

	while (!data->done) {
		fd = open(procfile, O_RDONLY);
		if (fd < 0) break;
		do {
			size = read(fd, buf, sizeof buf);
		} while (size == sizeof buf);
		if (close(fd) || size < 0) break;
		data->count_C++;
	}

	return NULL;
}

int main(int argc, char **argv)
{
	int fd;
	struct stat sb;
	char *mapped;
	struct thread_data data;
	pthread_t thread_A, thread_B, thread_C;

	if (argc != 2) {
		fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
		exit(1);
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "open %s: %s\n", argv[1], strerror(errno));
		exit(1);
	}
	if (fstat(fd, &sb) < 0) {
		fprintf(stderr, "fstat: %s\n", strerror(errno));
		exit(1);
	}
	mapped = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (mapped == MAP_FAILED) {
		fprintf(stderr, "mmap: %s\n", strerror(errno));
		exit(1);
	}
	if (close(fd)) {
		fprintf(stderr, "close: %s\n", strerror(errno));
		exit(1);
	}

	data.count_A = data.count_B = data.count_C = 0;
	data.done = 0;
	data.pid = getpid();
	data.mapped = mapped;
	data.mapped_size = sb.st_size;
	data.dummy = 0;
	data.barrier_count = 4;
	if (pthread_mutex_init(&data.barrier_mutex, NULL) ||
	    pthread_cond_init(&data.barrier_cond, NULL)) {
		fprintf(stderr, "pthread_*_init: %s\n", strerror(errno));
		exit(1);
	}

	if (pthread_create(&thread_A, NULL, func_A, &data) ||
	    pthread_create(&thread_B, NULL, func_B, &data) ||
	    pthread_create(&thread_C, NULL, func_C, &data)) {
		fprintf(stderr, "pthread_create: %s\n", strerror(errno));
		exit(1);
	}
	barrier(&data);
	sleep(10);
	data.done = 1;
	if (pthread_join(thread_A, NULL) ||
	    pthread_join(thread_B, NULL) ||
	    pthread_join(thread_C, NULL)) {
		fprintf(stderr, "pthread_join: %s\n", strerror(errno));
	}
	printf ("Counts: %lu A, %lu B, %lu C\n",
		data.count_A, data.count_B, data.count_C);
	return 0;
}
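
For reference, I build this with something along the lines of
'gcc -O2 -pthread down_read_unfair.c -o down_read_unfair_test' and point it
at a file large enough that thread A's random accesses actually have to go
to disk, so that do_page_fault() holds the mmap_sem across the I/O.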


* [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 10:39   ` David Howells
  2010-05-12  3:20 ` [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers Michel Lespinasse
                   ` (10 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

If there are no active threads using a semaphore, it is always correct to
unqueue blocked threads. This seems to be what was intended in the undo code.

What was done instead was to look for a sem count of zero - this is an
impossible situation, given that at least one thread is known to be queued
on the semaphore. The code might be correct as written, but it's hard to
reason about and it's not what was intended (otherwise the goto out would
have been unconditional).

Go for checking the active count - the alternative is not worth the headache.
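
To make the arithmetic concrete (an illustration only, using the 32-bit
constants from arch/x86/include/asm/rwsem.h, where the active count lives
in the low 16 bits and each queued waiter currently contributes
RWSEM_WAITING_BIAS = 0xffff0000): with one thread queued, the count keeps
a -0x10000 component, so even at the moment the last active holder drops
the lock the count is 0xffff0000, not zero.  The old test

	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) != 0)
		goto out;

can therefore essentially never be false, while

	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
		goto out;

looks only at the low 16 'active' bits and retries the wakeup exactly when
no active locks remain, which is the intended behavior.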

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 3e3365e..8d6a13e 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -138,7 +138,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	/* undo the change to count, but check for a transition 1->0 */
  undo:
-	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) != 0)
+	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
 	goto try_again;
 }
-- 
1.7.0.1



* [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers.
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
  2010-05-12  3:20 ` [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 11:01   ` David Howells
  2010-05-12 11:36   ` David Howells
  2010-05-12  3:20 ` [PATCH 03/12] rwsem: let RWSEM_WAITING_BIAS represent any number of waiting threads Michel Lespinasse
                   ` (9 subsequent siblings)
  11 siblings, 2 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

When waking up queued readers, __rwsem_do_wake was using a pair of atomic
operations (one before looking at the first queued thread and one after
having looped through all readers). Restructure code to use a single
atomic operation instead.

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   52 ++++++++++++++++++++++++++--------------------------
 1 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 8d6a13e..caee9b7 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -41,7 +41,7 @@ struct rwsem_waiter {
  * - if we come here from up_xxxx(), then:
  *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
  *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- *   - there must be someone on the queue
+ * - there must be someone on the queue
  * - the spinlock must be held by the caller
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only woken if downgrading is false
@@ -54,26 +54,26 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	struct list_head *next;
 	signed long oldcount, woken, loop;
 
+	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
+		goto readers_only;
+
 	if (downgrading)
-		goto dont_wake_writers;
+		/* Caller's lock is still active, so we can't possibly
+		 * succeed waking writers.
+		 */
+		goto out;
 
-	/* if we came through an up_xxxx() call, we only only wake someone up
+	/* There's a writer at the front of the queue - try to grant it the
+	 * write lock. However, we only only wake someone up
 	 * if we can transition the active part of the count from 0 -> 1
 	 */
- try_again:
+ retry_writer:
 	oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem)
 						- RWSEM_ACTIVE_BIAS;
 	if (oldcount & RWSEM_ACTIVE_MASK)
-		goto undo;
-
-	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
-	/* try to grant a single write lock if there's a writer at the front
-	 * of the queue - note we leave the 'active part' of the count
-	 * incremented by 1 and the waiting part incremented by 0x00010000
-	 */
-	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
-		goto readers_only;
+		/* Someone grabbed the sem already */
+		goto undo_writer;
 
 	/* We must be careful not to touch 'waiter' after we set ->task = NULL.
 	 * It is an allocated on the waiter's stack and may become invalid at
@@ -87,12 +87,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	put_task_struct(tsk);
 	goto out;
 
-	/* don't want to wake any writers */
- dont_wake_writers:
-	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-	if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
-		goto out;
-
 	/* grant an infinite number of read locks to the readers at the front
 	 * of the queue
 	 * - note we increment the 'active part' of the count by the number of
@@ -113,11 +107,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	loop = woken;
 	woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS;
-	if (!downgrading)
-		/* we'd already done one increment earlier */
-		woken -= RWSEM_ACTIVE_BIAS;
 
-	rwsem_atomic_add(woken, sem);
+ retry_readers:
+	oldcount = rwsem_atomic_update(woken, sem) - woken;
+	if (!downgrading && (oldcount & RWSEM_ACTIVE_MASK))
+		/* Someone grabbed the sem already */
+		goto undo_readers;
 
 	next = sem->wait_list.next;
 	for (; loop > 0; loop--) {
@@ -137,10 +132,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	return sem;
 
 	/* undo the change to count, but check for a transition 1->0 */
- undo:
+ undo_writer:
 	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
-	goto try_again;
+	goto retry_writer;
+
+ undo_readers:
+	if (rwsem_atomic_update(-woken, sem) & RWSEM_ACTIVE_MASK)
+		goto out;
+	goto retry_readers;
 }
 
 /*
-- 
1.7.0.1



* [PATCH 03/12] rwsem: let RWSEM_WAITING_BIAS represent any number of waiting threads
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
  2010-05-12  3:20 ` [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code Michel Lespinasse
  2010-05-12  3:20 ` [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12  3:20 ` [PATCH 04/12] rwsem: consistently use adjustment variable Michel Lespinasse
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Previously each waiting thread added a bias of RWSEM_WAITING_BIAS. With this
change, the bias is added only once when the wait list is non-empty.

This has a few nice properties, illustrated with a small example below,
which will be used in the following changes:
- when the mutex is held and the waiter list is known to be non-empty,
  count < RWSEM_WAITING_BIAS  <=>  there is an active writer on that sem
- count == RWSEM_WAITING_BIAS  <=>  there are waiting threads and no
                                     active readers/writers on that sem
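
As a concrete sketch (using the current 32-bit x86 constants,
RWSEM_ACTIVE_MASK = 0xffff, RWSEM_WAITING_BIAS = -0x10000,
RWSEM_ACTIVE_WRITE_BIAS = RWSEM_WAITING_BIAS + 1), some example counts:

	0x00000000  unlocked, nothing queued
	0xffff0000  waiters queued, no active readers or writers
	            (== RWSEM_WAITING_BIAS)
	0xffff0003  3 active readers plus queued waiters
	            (> RWSEM_WAITING_BIAS, so no active writer)
	0xfffe0001  an active writer plus queued waiters
	            (< RWSEM_WAITING_BIAS)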

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   23 +++++++++++++++--------
 1 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index caee9b7..0fb6e38 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -52,7 +52,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop;
+	signed long oldcount, woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -68,9 +68,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	 * write lock. However, we only only wake someone up
 	 * if we can transition the active part of the count from 0 -> 1
 	 */
+	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+	if (waiter->list.next == &sem->wait_list)
+		adjustment -= RWSEM_WAITING_BIAS;
+
  retry_writer:
-	oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem)
-						- RWSEM_ACTIVE_BIAS;
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
 	if (oldcount & RWSEM_ACTIVE_MASK)
 		/* Someone grabbed the sem already */
 		goto undo_writer;
@@ -106,7 +109,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	} while (waiter->flags & RWSEM_WAITING_FOR_READ);
 
 	loop = woken;
-	woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS;
+	woken *= RWSEM_ACTIVE_READ_BIAS;
+	if (waiter->flags & RWSEM_WAITING_FOR_READ)
+		/* hit end of list above */
+		woken -= RWSEM_WAITING_BIAS;
 
  retry_readers:
 	oldcount = rwsem_atomic_update(woken, sem) - woken;
@@ -133,7 +139,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	/* undo the change to count, but check for a transition 1->0 */
  undo_writer:
-	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
+	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
 	goto retry_writer;
 
@@ -160,6 +166,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 	waiter->task = tsk;
 	get_task_struct(tsk);
 
+	if (list_empty(&sem->wait_list))
+		adjustment += RWSEM_WAITING_BIAS;
 	list_add_tail(&waiter->list, &sem->wait_list);
 
 	/* we're now waiting on the lock, but no longer actively read-locking */
@@ -193,8 +201,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 	struct rwsem_waiter waiter;
 
 	waiter.flags = RWSEM_WAITING_FOR_READ;
-	rwsem_down_failed_common(sem, &waiter,
-				RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS);
+	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_READ_BIAS);
 	return sem;
 }
 
@@ -207,7 +214,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
 	struct rwsem_waiter waiter;
 
 	waiter.flags = RWSEM_WAITING_FOR_WRITE;
-	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS);
+	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_WRITE_BIAS);
 
 	return sem;
 }
-- 
1.7.0.1



* [PATCH 04/12] rwsem: consistently use adjustment variable
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (2 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 03/12] rwsem: let RWSEM_WAITING_BIAS represent any number of waiting threads Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 11:45   ` David Howells
  2010-05-12  3:20 ` [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics Michel Lespinasse
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Cleanup: the previous change introduced an 'adjustment' variable used when
waking writers; the code that wakes readers can be made nicer by reusing
that same variable.

Reducing variable count should hopefully help both humans and compilers
looking at this function :)

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   15 +++++++--------
 1 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 0fb6e38..92c8f8e 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -52,7 +52,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop, adjustment;
+	signed long oldcount, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -96,9 +96,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	 *   readers before waking any processes up
 	 */
  readers_only:
-	woken = 0;
+	loop = 0;
 	do {
-		woken++;
+		loop++;
 
 		if (waiter->list.next == &sem->wait_list)
 			break;
@@ -108,14 +108,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
 	} while (waiter->flags & RWSEM_WAITING_FOR_READ);
 
-	loop = woken;
-	woken *= RWSEM_ACTIVE_READ_BIAS;
+	adjustment = loop * RWSEM_ACTIVE_READ_BIAS;
 	if (waiter->flags & RWSEM_WAITING_FOR_READ)
 		/* hit end of list above */
-		woken -= RWSEM_WAITING_BIAS;
+		adjustment -= RWSEM_WAITING_BIAS;
 
  retry_readers:
-	oldcount = rwsem_atomic_update(woken, sem) - woken;
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
 	if (!downgrading && (oldcount & RWSEM_ACTIVE_MASK))
 		/* Someone grabbed the sem already */
 		goto undo_readers;
@@ -144,7 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	goto retry_writer;
 
  undo_readers:
-	if (rwsem_atomic_update(-woken, sem) & RWSEM_ACTIVE_MASK)
+	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
 	goto retry_readers;
 }
-- 
1.7.0.1



* [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (3 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 04/12] rwsem: consistently use adjustment variable Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 12:10   ` David Howells
  2010-05-12  3:20 ` [PATCH 06/12] rwsem: wake queued readers when other readers are active Michel Lespinasse
                   ` (6 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Now that an arbitrary number of waiting threads are represented by
a single RWSEM_WAITING_BIAS, the up_read and up_write code paths can be
slightly simplified by checking for waiting threads and no active threads
all at once.
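
As a sketch of the resulting check (with the 32-bit constants and the new
waiting-bias semantics): if two readers hold the lock and a writer is
queued, the count is 2 - 0x10000.  The first __up_read()'s xadd returns
2 - 0x10000, which is not RWSEM_WAITING_BIAS + RWSEM_ACTIVE_READ_BIAS, so
no wakeup is attempted; the second one sees exactly 1 - 0x10000 and calls
rwsem_wake().  Likewise, __up_write() only calls rwsem_wake() when the old
count was exactly RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS, i.e. there
are waiters and the caller was the only active holder.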

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 arch/x86/include/asm/rwsem.h |   23 ++++++++++++++---------
 arch/x86/lib/rwsem_64.S      |    4 +---
 arch/x86/lib/semaphore_32.S  |    4 +---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede1..a15b84d 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
 {
 	asm volatile("# beginning down_read\n\t"
 		     LOCK_PREFIX _ASM_INC "(%1)\n\t"
-		     /* adds 0x00000001, returns the old value */
+		     /* adds 0x00000001 */
 		     "  jns        1f\n"
 		     "  call call_rwsem_down_read_failed\n"
 		     "1:\n\t"
@@ -160,7 +160,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 	tmp = RWSEM_ACTIVE_WRITE_BIAS;
 	asm volatile("# beginning down_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
-		     /* subtract 0x0000ffff, returns the old value */
+		     /* adds 0xffff0001, returns the old value */
 		     "  test      %1,%1\n\t"
 		     /* was the count 0 before? */
 		     "  jz        1f\n"
@@ -199,12 +199,15 @@ static inline void __up_read(struct rw_semaphore *sem)
 	asm volatile("# beginning __up_read\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* subtracts 1, returns the old value */
-		     "  jns        1f\n\t"
+		     "  cmp        %4,%1\n\t"
+		     /* are there waiting threads and no active threads ? */
+		     "  jne        1f\n"
 		     "  call call_rwsem_wake\n"
 		     "1:\n"
 		     "# ending __up_read\n"
 		     : "+m" (sem->count), "=d" (tmp)
-		     : "a" (sem), "1" (tmp)
+		     : "a" (sem), "1" (tmp),
+		       "er" (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_READ_BIAS)
 		     : "memory", "cc");
 }
 
@@ -213,17 +216,19 @@ static inline void __up_read(struct rw_semaphore *sem)
  */
 static inline void __up_write(struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp;
+	rwsem_count_t tmp = -RWSEM_ACTIVE_WRITE_BIAS;
 	asm volatile("# beginning __up_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
-		     /* tries to transition
-			0xffff0001 -> 0x00000000 */
-		     "  jz       1f\n"
+		     /* substracts 0xffff0001, returns the old value */
+		     "  cmp      %4,%1\n\t"
+		     /* are there waiting threads and no active threads ? */
+		     "  jne      1f\n"
 		     "  call call_rwsem_wake\n"
 		     "1:\n\t"
 		     "# ending __up_write\n"
 		     : "+m" (sem->count), "=d" (tmp)
-		     : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS)
+		     : "a" (sem), "1" (tmp),
+		       "er" (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS)
 		     : "memory", "cc");
 }
 
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00..770a387 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -60,13 +60,11 @@ ENTRY(call_rwsem_down_write_failed)
 	ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
-	decl %edx	/* do nothing if still outstanding active readers */
-	jnz 1f
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_wake
 	restore_common_regs
-1:	ret
+	ret
 	ENDPROC(call_rwsem_wake)
 
 /* Fix up special calling conventions */
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe47..63dbf75 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -103,15 +103,13 @@ ENTRY(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
 	CFI_STARTPROC
-	decw %dx    /* do nothing if still outstanding active readers */
-	jnz 1f
 	push %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx,0
 	call rwsem_wake
 	pop %ecx
 	CFI_ADJUST_CFA_OFFSET -4
-1:	ret
+	ret
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_wake)
 
-- 
1.7.0.1



* [PATCH 06/12] rwsem: wake queued readers when other readers are active
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (4 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 12:22   ` David Howells
  2010-05-12  3:20 ` [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock Michel Lespinasse
                   ` (5 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

This change addresses the following situation:

- Thread A holds the rwsem for write
- Thread B tries to acquire the rwsem for read; blocks & gets queued
- Thread A releases the rwsem, notices active count goes back to 0
- Thread C acquires the rwsem for read
- Thread A acquires the spinlock & tries to wake thread B, but fails because
  active count is not zero anymore.

In this situation, it would be perfectly fine to let threads B and C work
in parallel as they each only want a read acquire on the rwsem. We can
recognize this situation and let A wake B as long as there are no active
writers on the rwsem.
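
As a worked example (32-bit constants, waiting-bias semantics from patch 3):
with A holding the lock for write and B queued, count = -0xffff + -0x10000.
A's up_write() brings the count to RWSEM_WAITING_BIAS and calls
rwsem_wake(); before A gets the wait_lock, C's down_read() fast path makes
the count 1 - 0x10000.  When A then applies the adjustment for B, the old
count it sees is 1 - 0x10000: the active-mask test would back out, while
the new 'oldcount < RWSEM_WAITING_BIAS' test correctly recognizes that the
only activity is a reader and wakes B anyway.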

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 92c8f8e..9d0899b 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -115,8 +115,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
  retry_readers:
 	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-	if (!downgrading && (oldcount & RWSEM_ACTIVE_MASK))
-		/* Someone grabbed the sem already */
+	if (!downgrading && (oldcount < RWSEM_WAITING_BIAS))
+		/* Someone grabbed the sem for write already */
 		goto undo_readers;
 
 	next = sem->wait_list.next;
@@ -143,7 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	goto retry_writer;
 
  undo_readers:
-	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
+	if (rwsem_atomic_update(-adjustment, sem) < RWSEM_WAITING_BIAS)
 		goto out;
 	goto retry_readers;
 }
-- 
1.7.0.1



* [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (5 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 06/12] rwsem: wake queued readers when other readers are active Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 12:33   ` David Howells
  2010-05-12  3:20 ` [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common Michel Lespinasse
                   ` (4 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

This change addresses the following situation:

- Thread A acquires the rwsem for read
- Thread B tries to acquire the rwsem for write, notices there is already
  an active owner for the rwsem.
- Thread C tries to acquire the rwsem for read, notices that thread B already
  tried to acquire it.
- Thread C grabs the spinlock and queues itself on the wait queue.
- Thread B grabs the spinlock and queues itself behind C. At this point A is
  the only remaining active owner on the rwsem.

In this situation thread B could notice that it was the last active writer
on the rwsem, and decide to wake C to let it proceed in parallel with A
since they both only want the rwsem for read.
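
Concretely (again with the 32-bit constants): once B has queued itself and
backed out its own write bias, the count is one active reader (A) plus
RWSEM_WAITING_BIAS, i.e. count > RWSEM_WAITING_BIAS while B's adjustment is
positive; that is exactly the new condition under which B wakes the readers
(C) queued ahead of it.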

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   32 ++++++++++++++++++++++----------
 1 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 9d0899b..84bbc55 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -36,6 +36,10 @@ struct rwsem_waiter {
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
 };
 
+#define RWSEM_WAKE_ANY        0 /* Wake whatever's at head of wait list */
+#define RWSEM_WAKE_READERS    1 /* Sem is read owned by other thread */
+#define RWSEM_WAKE_READ_OWNED 2 /* Sem is read owned by caller thread */
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then:
@@ -46,8 +50,8 @@ struct rwsem_waiter {
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only woken if downgrading is false
  */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
+static struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -58,9 +62,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
 		goto readers_only;
 
-	if (downgrading)
-		/* Caller's lock is still active, so we can't possibly
-		 * succeed waking writers.
+	if (wake_type != RWSEM_WAKE_ANY)
+		/* Another active reader was observed, so wakeup is not
+		 * likely to succeed. Save the atomic op.
 		 */
 		goto out;
 
@@ -115,7 +119,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
 
  retry_readers:
 	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-	if (!downgrading && (oldcount < RWSEM_WAITING_BIAS))
+	if (wake_type != RWSEM_WAKE_READ_OWNED &&
+	    oldcount < RWSEM_WAITING_BIAS)
 		/* Someone grabbed the sem for write already */
 		goto undo_readers;
 
@@ -172,9 +177,16 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 	/* we're now waiting on the lock, but no longer actively read-locking */
 	count = rwsem_atomic_update(adjustment, sem);
 
-	/* if there are no active locks, wake the front queued process(es) up */
+	/* if there are no active locks, wake the front queued process(es) up.
+	 *
+	 * or if we're called from a failed down_write(), and there were
+	 * already threads queued before us, and there are no active writers,
+	 * the lock must be read owned; try to wake any read locks that were
+	 * queued ahead of us. */
 	if (!(count & RWSEM_ACTIVE_MASK))
-		sem = __rwsem_do_wake(sem, 0);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+	else if (adjustment > 0 && count > RWSEM_WAITING_BIAS)
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
 
 	spin_unlock_irq(&sem->wait_lock);
 
@@ -230,7 +242,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, 0);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
 
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
 
@@ -250,7 +262,7 @@ asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, 1);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
 
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
 
-- 
1.7.0.1



* [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (6 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 12:36   ` David Howells
  2010-05-12 12:42   ` David Howells
  2010-05-12  3:20 ` [PATCH 09/12] generic rwsem: implement down_read_unfair Michel Lespinasse
                   ` (3 subsequent siblings)
  11 siblings, 2 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

More code can be pushed from rwsem_down_read_failed and
rwsem_down_write_failed into rwsem_down_failed_common.

The following change also benefits from having the flags available in a
register rather than having to fish them out of the struct rwsem_waiter...

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   23 +++++++++--------------
 1 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 84bbc55..1702524 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -158,8 +158,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
  */
 static struct rw_semaphore __sched *
 rwsem_down_failed_common(struct rw_semaphore *sem,
-			struct rwsem_waiter *waiter, signed long adjustment)
+			 unsigned int flags, signed long adjustment)
 {
+	struct rwsem_waiter waiter;
 	struct task_struct *tsk = current;
 	signed long count;
 
@@ -167,7 +168,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 
 	/* set up my own style of waitqueue */
 	spin_lock_irq(&sem->wait_lock);
-	waiter->task = tsk;
+	waiter.task = tsk;
+	waiter.flags = flags;
 	get_task_struct(tsk);
 
 	if (list_empty(&sem->wait_list))
@@ -192,7 +194,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 
 	/* wait to be given the lock */
 	for (;;) {
-		if (!waiter->task)
+		if (!waiter.task)
 			break;
 		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
@@ -209,11 +211,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 asmregparm struct rw_semaphore __sched *
 rwsem_down_read_failed(struct rw_semaphore *sem)
 {
-	struct rwsem_waiter waiter;
-
-	waiter.flags = RWSEM_WAITING_FOR_READ;
-	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_READ_BIAS);
-	return sem;
+	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
+					-RWSEM_ACTIVE_READ_BIAS);
 }
 
 /*
@@ -222,12 +221,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 asmregparm struct rw_semaphore __sched *
 rwsem_down_write_failed(struct rw_semaphore *sem)
 {
-	struct rwsem_waiter waiter;
-
-	waiter.flags = RWSEM_WAITING_FOR_WRITE;
-	rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_WRITE_BIAS);
-
-	return sem;
+	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
+					-RWSEM_ACTIVE_WRITE_BIAS);
 }
 
 /*
-- 
1.7.0.1



* [PATCH 09/12] generic rwsem: implement down_read_unfair
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (7 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common Michel Lespinasse
@ 2010-05-12  3:20 ` Michel Lespinasse
  2010-05-12 12:46   ` David Howells
  2010-05-12  3:21 ` [PATCH 10/12] rwsem: down_read_unfair infrastructure support Michel Lespinasse
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:20 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Add down_read_unfair() API.

This is similar to down_read() with the following changes:
- when the rwsem is read owned with queued writers, down_read_unfair() callers
  are allowed to acquire the rwsem for read without queueing;
- when the rwsem is write owned, down_read_unfair() callers get queued in
  front of threads trying to acquire the rwsem by other means.
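
A minimal caller sketch (not part of this patch - the actual call sites
come in the last patch of the series), for a path that only needs a quick,
read-only look at another task's address space:

	down_read_unfair(&mm->mmap_sem);
	/* walk the vma list, e.g. to format /proc/<pid>/maps output */
	up_read(&mm->mmap_sem);

Note the release side is the regular up_read(); only the acquire is unfair.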

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 include/linux/rwsem-spinlock.h |   10 +++++++++-
 include/linux/rwsem.h          |   10 ++++++++++
 kernel/rwsem.c                 |   17 +++++++++++++++++
 lib/rwsem-spinlock.c           |   10 +++++++---
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index bdfcc25..dc849d9 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -60,7 +60,15 @@ do {								\
 	__init_rwsem((sem), #sem, &__key);			\
 } while (0)
 
-extern void __down_read(struct rw_semaphore *sem);
+#define __HAVE_DOWN_READ_UNFAIR
+
+static inline void __down_read(struct rw_semaphore *sem)
+	{ __down_read_internal(sem, 0); }
+
+static inline void __sched __down_read_unfair(struct rw_semaphore *sem)
+	{ __down_read_internal(sem, 1);	}
+
+extern void __down_read_internal(struct rw_semaphore *sem, int unfair);
 extern int __down_read_trylock(struct rw_semaphore *sem);
 extern void __down_write(struct rw_semaphore *sem);
 extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index efd348f..365e47b 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -28,6 +28,16 @@ struct rw_semaphore;
 extern void down_read(struct rw_semaphore *sem);
 
 /*
+ * lock for reading - skip waiting writers
+ */
+#ifdef __HAVE_DOWN_READ_UNFAIR
+extern void down_read_unfair(struct rw_semaphore *sem);
+#else
+static inline void down_read_unfair(struct rw_semaphore *sem)
+	{ down_read(sem); }
+#endif
+
+/*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 extern int down_read_trylock(struct rw_semaphore *sem);
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b..0cef726 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -26,6 +26,23 @@ void __sched down_read(struct rw_semaphore *sem)
 
 EXPORT_SYMBOL(down_read);
 
+#ifdef __HAVE_DOWN_READ_UNFAIR
+
+/*
+ * lock for reading - skip waiting writers
+ */
+void __sched down_read_unfair(struct rw_semaphore *sem)
+{
+	might_sleep();
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+	LOCK_CONTENDED(sem, __down_read_trylock, __down_read_unfair);
+}
+
+EXPORT_SYMBOL(down_read_unfair);
+
+#endif
+
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index ffc9fc7..b2fd5fb 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -139,7 +139,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
 /*
  * get a read lock on the semaphore
  */
-void __sched __down_read(struct rw_semaphore *sem)
+void __sched __down_read_internal(struct rw_semaphore *sem, int unfair)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -147,7 +147,7 @@ void __sched __down_read(struct rw_semaphore *sem)
 
 	spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+	if (sem->activity >= 0 && (unfair || list_empty(&sem->wait_list))) {
 		/* granted */
 		sem->activity++;
 		spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -162,7 +162,11 @@ void __sched __down_read(struct rw_semaphore *sem)
 	waiter.flags = RWSEM_WAITING_FOR_READ;
 	get_task_struct(tsk);
 
-	list_add_tail(&waiter.list, &sem->wait_list);
+	if (unfair) {
+		list_add(&waiter.list, &sem->wait_list);
+	} else {
+		list_add_tail(&waiter.list, &sem->wait_list);
+	}
 
 	/* we don't need to touch the semaphore struct anymore */
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
-- 
1.7.0.1



* [PATCH 10/12] rwsem: down_read_unfair infrastructure support
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (8 preceding siblings ...)
  2010-05-12  3:20 ` [PATCH 09/12] generic rwsem: implement down_read_unfair Michel Lespinasse
@ 2010-05-12  3:21 ` Michel Lespinasse
  2010-05-12  3:21 ` [PATCH 11/12] x86 rwsem: down_read_unfair implementation Michel Lespinasse
  2010-05-12  3:21 ` [PATCH 12/12] Use down_read_unfair() for /proc/<pid>/exe and /proc/<pid>/maps files Michel Lespinasse
  11 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:21 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

Add an rwsem_down_read_unfair_failed() function in the non-generic rwsem
library code, similar to rwsem_down_read_failed() except that blocked
threads are placed at the head of the queue.

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 lib/rwsem.c |   22 +++++++++++++++++++++-
 1 files changed, 21 insertions(+), 1 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 1702524..689d4a2 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -34,6 +34,7 @@ struct rwsem_waiter {
 	unsigned int flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
+#define RWSEM_UNFAIR            0x00000004
 };
 
 #define RWSEM_WAKE_ANY        0 /* Wake whatever's at head of wait list */
@@ -174,7 +175,11 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 
 	if (list_empty(&sem->wait_list))
 		adjustment += RWSEM_WAITING_BIAS;
-	list_add_tail(&waiter->list, &sem->wait_list);
+
+	if (flags & RWSEM_UNFAIR)
+		list_add(&waiter.list, &sem->wait_list);
+	else
+		list_add_tail(&waiter.list, &sem->wait_list);
 
 	/* we're now waiting on the lock, but no longer actively read-locking */
 	count = rwsem_atomic_update(adjustment, sem);
@@ -215,6 +220,21 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 					-RWSEM_ACTIVE_READ_BIAS);
 }
 
+#ifdef __HAVE_DOWN_READ_UNFAIR
+
+/*
+ * wait for the read lock to be granted - skip waiting writers
+ */
+asmregparm struct rw_semaphore __sched *
+rwsem_down_read_unfair_failed(struct rw_semaphore *sem)
+{
+	return rwsem_down_failed_common(sem,
+					RWSEM_WAITING_FOR_READ | RWSEM_UNFAIR,
+					-RWSEM_ACTIVE_READ_BIAS);
+}
+
+#endif
+
 /*
  * wait for the write lock to be granted
  */
-- 
1.7.0.1



* [PATCH 11/12] x86 rwsem: down_read_unfair implementation
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (9 preceding siblings ...)
  2010-05-12  3:21 ` [PATCH 10/12] rwsem: down_read_unfair infrastructure support Michel Lespinasse
@ 2010-05-12  3:21 ` Michel Lespinasse
  2010-05-12 13:08   ` David Howells
  2010-05-12  3:21 ` [PATCH 12/12] Use down_read_unfair() for /proc/<pid>/exe and /proc/<pid>/maps files Michel Lespinasse
  11 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:21 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

RWSEM_ACTIVE_WRITE_BIAS is set to be 'more negative' than RWSEM_WAITING_BIAS
so that down_read_unfair() can check for active writers by comparing
the rwsem count against RWSEM_WAITING_BIAS.
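
With the 32-bit values from this patch (RWSEM_ACTIVE_MASK = 0x7fff,
RWSEM_WAITING_BIAS = -0x8000, RWSEM_ACTIVE_WRITE_BIAS = 2 *
RWSEM_WAITING_BIAS + 1 = -0xffff), a sketch of the old counts the
__down_read_unfair() fast path can observe:

	0                   unlocked                    -> >= WAITING_BIAS, proceed
	3 - 0x8000          3 readers + queued waiters  -> >= WAITING_BIAS, proceed
	-0xffff             active writer, no waiters   -> <  WAITING_BIAS, slow path
	-0xffff - 0x8000    active writer + waiters     -> <  WAITING_BIAS, slow path

so a single signed compare against RWSEM_WAITING_BIAS tells unfair readers
whether they may take the lock immediately.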

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 arch/x86/include/asm/rwsem.h |   43 ++++++++++++++++++++++++++++++++++-------
 arch/x86/lib/rwsem_64.S      |   10 +++++++++
 arch/x86/lib/semaphore_32.S  |   17 ++++++++++++++++
 3 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a15b84d..0b3a924 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -16,11 +16,10 @@
  * if there are writers (and maybe) readers waiting (in which case it goes to
  * sleep).
  *
- * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
- * be extended to 65534 by manually checking the whole MSW rather than relying
- * on the S flag.
+ * The WRITE_BIAS value supports up to 32767 processes simultaneously
+ * trying to acquire a write lock.
  *
- * The value of ACTIVE_BIAS supports up to 65535 active processes.
+ * The value of ACTIVE_BIAS supports up to 32767 active processes.
  *
  * This should be totally fair - if anything is waiting, a process that wants a
  * lock will go to the back of the queue. When the currently active lock is
@@ -48,6 +47,8 @@ struct rwsem_waiter;
 extern asmregparm struct rw_semaphore *
  rwsem_down_read_failed(struct rw_semaphore *sem);
 extern asmregparm struct rw_semaphore *
+ rwsem_down_read_unfair_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
  rwsem_down_write_failed(struct rw_semaphore *sem);
 extern asmregparm struct rw_semaphore *
  rwsem_wake(struct rw_semaphore *);
@@ -63,16 +64,19 @@ extern asmregparm struct rw_semaphore *
  */
 
 #ifdef CONFIG_X86_64
-# define RWSEM_ACTIVE_MASK		0xffffffffL
+# define RWSEM_ACTIVE_MASK		0x7fffffffL
 #else
-# define RWSEM_ACTIVE_MASK		0x0000ffffL
+# define RWSEM_ACTIVE_MASK		0x00007fffL
 #endif
 
 #define RWSEM_UNLOCKED_VALUE		0x00000000L
 #define RWSEM_ACTIVE_BIAS		0x00000001L
 #define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+#define RWSEM_ACTIVE_WRITE_BIAS		(2 * RWSEM_WAITING_BIAS + \
+					 RWSEM_ACTIVE_BIAS)
+
+#define __HAVE_DOWN_READ_UNFAIR
 
 typedef signed long rwsem_count_t;
 
@@ -129,6 +133,28 @@ static inline void __down_read(struct rw_semaphore *sem)
 }
 
 /*
+ * lock for reading - skip waiting writers
+ */
+static inline void __down_read_unfair(struct rw_semaphore *sem)
+{
+	rwsem_count_t tmp;
+
+	tmp = RWSEM_ACTIVE_READ_BIAS;
+	asm volatile("# beginning down_read_unfair\n\t"
+		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
+		     /* adds 0x00000001, returns the old value */
+		     "  cmp       %4,%1\n\t"
+		     /* was the count >= RWSEM_WAITING_BIAS before? */
+		     "  jge       1f\n"
+		     "  call call_rwsem_down_read_unfair_failed\n"
+		     "1:\n"
+		     "# ending down_read_unfair"
+		     : "+m" (sem->count), "=r" (tmp)
+		     : "a" (sem), "1" (tmp), "re" (RWSEM_WAITING_BIAS)
+		     : "memory", "cc");
+}
+
+/*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 static inline int __down_read_trylock(struct rw_semaphore *sem)
@@ -248,7 +274,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 		     "1:\n\t"
 		     "# ending __downgrade_write\n"
 		     : "+m" (sem->count)
-		     : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
+		     : "a" (sem),
+		       "er" (RWSEM_ACTIVE_READ_BIAS - RWSEM_ACTIVE_WRITE_BIAS)
 		     : "memory", "cc");
 }
 
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 770a387..328ef64 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -51,6 +51,16 @@ ENTRY(call_rwsem_down_read_failed)
 	ret
 	ENDPROC(call_rwsem_down_read_failed)
 
+ENTRY(call_rwsem_down_read_unfair_failed)
+	save_common_regs
+	pushq %rdx
+	movq %rax,%rdi
+	call rwsem_down_read_unfair_failed
+	popq %rdx
+	restore_common_regs
+	ret
+	ENDPROC(call_rwsem_down_read_unfair_failed)
+
 ENTRY(call_rwsem_down_write_failed)
 	save_common_regs
 	movq %rax,%rdi
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 63dbf75..115d2ad 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -89,6 +89,23 @@ ENTRY(call_rwsem_down_read_failed)
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_down_read_failed)
 
+ENTRY(call_rwsem_down_read_unfair_failed)
+	CFI_STARTPROC
+	push %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	call rwsem_down_read_unfair_failed
+	pop %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	pop %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	ret
+	CFI_ENDPROC
+	ENDPROC(call_rwsem_down_read_unfair_failed)
+
 ENTRY(call_rwsem_down_write_failed)
 	CFI_STARTPROC
 	push %ecx
-- 
1.7.0.1



* [PATCH 12/12] Use down_read_unfair() for /proc/<pid>/exe and /proc/<pid>/maps files
  2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
                   ` (10 preceding siblings ...)
  2010-05-12  3:21 ` [PATCH 11/12] x86 rwsem: down_read_unfair implementation Michel Lespinasse
@ 2010-05-12  3:21 ` Michel Lespinasse
  2010-05-12 13:10   ` David Howells
  2010-05-12 22:53   ` KOSAKI Motohiro
  11 siblings, 2 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12  3:21 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner
  Cc: LKML, Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han,
	Michel Lespinasse

This helps in the following situation:
- Thread A takes a page fault while reading or writing memory.
  do_page_fault() acquires the mmap_sem for read and blocks on disk
  (either reading the page from file, or hitting swap) for a long time.
- Thread B does an mmap call and blocks trying to acquire the mmap_sem
  for write
- Thread C is a monitoring process trying to read every /proc/pid/maps
  in the system. This requires acquiring the mmap_sem for read. Thread C
  blocks behind B, waiting for A to release the rwsem.  If thread C
  could be allowed to run in parallel with A, it would probably get done
  long before thread A's disk access completes, thus not actually slowing
  down thread B.

Test results with down_read_unfair_test (10 seconds):

2.6.33.3:
threadA completes ~600 faults
threadB completes ~300 mmap/munmap cycles
threadC completes ~600 /proc/pid/maps reads

2.6.33.3 + down_read_unfair:
threadA completes ~600 faults
threadB completes ~300 mmap/munmap cycles
threadC completes ~160000 /proc/pid/maps reads

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 fs/proc/base.c       |    2 +-
 fs/proc/task_mmu.c   |    2 +-
 fs/proc/task_nommu.c |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8418fcc..9132488 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1367,7 +1367,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 
 	/* We need mmap_sem to protect against races with removal of
 	 * VM_EXECUTABLE vmas */
-	down_read(&mm->mmap_sem);
+	down_read_unfair(&mm->mmap_sem);
 	exe_file = mm->exe_file;
 	if (exe_file)
 		get_file(exe_file);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0705534..09647ad 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -123,7 +123,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
-	down_read(&mm->mmap_sem);
+	down_read_unfair(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 46d4b5d..56ca830 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -194,7 +194,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
-	down_read(&mm->mmap_sem);
+	down_read_unfair(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-- 
1.7.0.1



* Re: [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code
  2010-05-12  3:20 ` [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code Michel Lespinasse
@ 2010-05-12 10:39   ` David Howells
  0 siblings, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 10:39 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> If there are no active threads using a semaphore, it is always correct to
> unqueue blocked threads. This seems to be what was intended in the undo code.
> 
> What was done instead was to look for a sem count of zero - this is an
> impossible situation, given that at least one thread is known to be queued
> on the semaphore. The code might be correct as written, but it's hard to
> reason about and it's not what was intended (otherwise the goto out would
> have been unconditional).
> 
> Go for checking the active count - the alternative is not worth the headache.

I think this is a definite bug fix, so I've sent it upstream in advance.

David


* Re: [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers.
  2010-05-12  3:20 ` [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers Michel Lespinasse
@ 2010-05-12 11:01   ` David Howells
  2010-05-13  0:54     ` Michel Lespinasse
  2010-05-12 11:36   ` David Howells
  1 sibling, 1 reply; 33+ messages in thread
From: David Howells @ 2010-05-12 11:01 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> - *   - there must be someone on the queue
> + * - there must be someone on the queue

Why did you change this comment?  This is still a guarantee up_xxxx() must
make about the state of the rwsem.

> +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
> +	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
> +		goto readers_only;
> +
>  	if (downgrading)
> -		goto dont_wake_writers;
> +		/* Caller's lock is still active, so we can't possibly
> +		 * succeed waking writers.
> +		 */
> +		goto out;

It's a nice idea to do it this way round - it puts the wake-up-reader path
first and puts the downgrader on the slower path.

> -	/* if we came through an up_xxxx() call, we only only wake someone up
> +	/* There's a writer at the front of the queue - try to grant it the
> +	 * write lock. However, we only only wake someone up
>  	 * if we can transition the active part of the count from 0 -> 1
>  	 */

Two spaces after a full stop, please, and can you please adjust the comment so
that it fills out to 80 chars.  E.g:

	/* There's a writer at the front of the queue - try to grant it the
	 * write lock.  However, we only only wake someone up if we can
	 * transition the active part of the count from 0 -> 1
	 */

instead of:

	/* There's a writer at the front of the queue - try to grant it the
	 * write lock. However, we only only wake someone up
	 * if we can transition the active part of the count from 0 -> 1
	 */

> + retry_readers:
> +	oldcount = rwsem_atomic_update(woken, sem) - woken;
> +	if (!downgrading && (oldcount & RWSEM_ACTIVE_MASK))

The problem with doing this here is that you may just have wasted all the work
you did working out what woken is going to be.  That may have been quite slow
as the CPU may have had to get hold of a bunch of cachelines that weren't in
its cache.  Furthermore, you are doing this under a spinlock, so you may have
lost your right to wake anyone up, and you'll be blocking the CPU that will be
allowed to perform the wakeup.

Incrementing the count first nets you a guarantee that you have the right to
wake things up.

You may point out that if there's no contention, then what your revised code
does doesn't slow anything down.  That's true, but on modern CPU's, neither
does the old code as the exclusively held cache line will lurk in the CPU's
cache until there's contention on it.

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers.
  2010-05-12  3:20 ` [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers Michel Lespinasse
  2010-05-12 11:01   ` David Howells
@ 2010-05-12 11:36   ` David Howells
  1 sibling, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 11:36 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> +	 * write lock. However, we only only wake someone up

Can you get rid of the excess 'only' whilst you're at it?

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 04/12] rwsem: consistently use adjustment variable
  2010-05-12  3:20 ` [PATCH 04/12] rwsem: consistently use adjustment variable Michel Lespinasse
@ 2010-05-12 11:45   ` David Howells
  2010-05-13  1:12     ` Michel Lespinasse
  0 siblings, 1 reply; 33+ messages in thread
From: David Howells @ 2010-05-12 11:45 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> Cleanup: previous change introduced an adjustment variable used in
> waking writers; the code to wake readers can be made nicer by making
> use of that same variable.
> 
> Reducing variable count should hopefully help both humans and compilers
> looking at this function :)

Actually, no.  The compiler doesn't care.  With optimisation, the variable
'woken' would cease to exist after being copied to loop and added into
adjustment.  What it would probably do is just relabel 'woken' internally as
'loop'.

From a human point of view, I would say keep woken, but use it to initialise
adjustment:

	adjustment = woken * RWSEM_ACTIVE_READ_BIAS;

and use it to initialise 'loop':

	for (loop = woken; loop > 0; loop--) {

or just ditch 'loop' and use 'woken' directly, since the number of processes
to be woken goes down as the loop iterates.

Whatever, the compiler won't care, since loop and woken will just be names for
the same value.
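
Putting those pieces together, the reader-waking tail would look roughly like
this (sketch only; the body of the wake-up loop stays whatever the current
code already does):

	adjustment = woken * RWSEM_ACTIVE_READ_BIAS;
	/* ...fold 'adjustment' into sem->count as the patch does... */

	next = sem->wait_list.next;
	for (loop = woken; loop > 0; loop--) {
		waiter = list_entry(next, struct rwsem_waiter, list);
		next = waiter->list.next;
		/* wake waiter->task as before */
	}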

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics
  2010-05-12  3:20 ` [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics Michel Lespinasse
@ 2010-05-12 12:10   ` David Howells
  0 siblings, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 12:10 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

>  static inline void __up_write(struct rw_semaphore *sem)
>  {
> ...
>  		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
> -		     /* tries to transition
> -			0xffff0001 -> 0x00000000 */
> -		     "  jz       1f\n"
> +		     /* subtracts 0xffff0001, returns the old value */
> +		     "  cmp      %4,%1\n\t"
> +		     /* are there waiting threads and no active threads ? */
> +		     "  jne      1f\n"
>  		     "  call call_rwsem_wake\n"

It looks like you're betting on there being fast-path contention.  If the
common case is no fast-path contention, then you'd be better off putting the
comparison out of line in the medium-path and retaining the JZ instruction.

The same goes for __up_read(): you could retain the JNS there and put the
comparison out of line into the medium-path.

Doing this also saves you some code space.

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 06/12] rwsem: wake queued readers when other readers are active
  2010-05-12  3:20 ` [PATCH 06/12] rwsem: wake queued readers when other readers are active Michel Lespinasse
@ 2010-05-12 12:22   ` David Howells
  2010-05-13  2:39     ` Michel Lespinasse
  0 siblings, 1 reply; 33+ messages in thread
From: David Howells @ 2010-05-12 12:22 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> In this situation, it would be perfectly fine to let threads B and C work
> in parallel as they each only want a read acquire on the rwsem. We can
> recognize this situation and let A wake B as long as there are no active
> writers on the rwsem.

There can't be any active writers on the rwsem.  An active writer must have
just been upped and is in the process of waking the first sleeper up.

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock
  2010-05-12  3:20 ` [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock Michel Lespinasse
@ 2010-05-12 12:33   ` David Howells
  0 siblings, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 12:33 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> +	/* if there are no active locks, wake the front queued process(es) up.
> +	 *
> +	 * or if we're called from a failed down_write(), and there were
> +	 * already threads queued before us, and there are no active writers,
> +	 * the lock must be read owned; try to wake any read locks that were
> +	 * queued ahead of us. */

That looks weird.  Can I suggest rewriting it thus:

	/* If there are no active locks, wake the front queued process(es) up.
	 *
	 * Alternatively, if we're called from a failed down_write(), there
	 * were already threads queued before us and there are no active
	 * writers, the lock must be read owned; so we try to wake any read
	 * locks that were queued ahead of us. */

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common
  2010-05-12  3:20 ` [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common Michel Lespinasse
@ 2010-05-12 12:36   ` David Howells
  2010-05-12 12:42   ` David Howells
  1 sibling, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 12:36 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han


Have you tried compiling this patch?

lib/rwsem.c: In function 'rwsem_down_failed_common':
lib/rwsem.c:177: error: invalid type argument of '->' (have 'struct rwsem_waiter')

There's a list_add_tail() that refers to waiter->list
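
Presumably the fix is just the matching member access now that 'waiter' is a
struct on the stack rather than a pointer, i.e. something like:

	list_add_tail(&waiter.list, &sem->wait_list);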

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common
  2010-05-12  3:20 ` [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common Michel Lespinasse
  2010-05-12 12:36   ` David Howells
@ 2010-05-12 12:42   ` David Howells
  2010-05-13  2:54     ` Michel Lespinasse
  1 sibling, 1 reply; 33+ messages in thread
From: David Howells @ 2010-05-12 12:42 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> Following change also enjoys having flags available in a register rather
> than having to fish it out in the struct rwsem_waiter...

So what?  It doesn't use flags in rwsem_down_failed_common().

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 09/12] generic rwsem: implement down_read_unfair
  2010-05-12  3:20 ` [PATCH 09/12] generic rwsem: implement down_read_unfair Michel Lespinasse
@ 2010-05-12 12:46   ` David Howells
  0 siblings, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 12:46 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

>  /*
> + * lock for reading - skip waiting writers
> + */

They may also skip waiting readers, if such are queued behind a writer.
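
I.e. the comment could read something like:

	/*
	 * lock for reading - skip waiting writers (and any readers queued
	 * behind them)
	 */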

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 11/12] x86 rwsem: down_read_unfair implementation
  2010-05-12  3:21 ` [PATCH 11/12] x86 rwsem: down_read_unfair implementation Michel Lespinasse
@ 2010-05-12 13:08   ` David Howells
  0 siblings, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 13:08 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

>  #define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1)
>  #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
> +#define RWSEM_ACTIVE_WRITE_BIAS		(2 * RWSEM_WAITING_BIAS + \
> +					 RWSEM_ACTIVE_BIAS)

These are getting a bit meaningless.  Can you replace them with hex numbers to
make it clearer what's going on?  Putting a worked example in the patch
description would be good too.
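
For instance, plugging in the 32-bit x86 values (RWSEM_ACTIVE_MASK of
0x0000ffff and RWSEM_ACTIVE_BIAS of 1 - taken from the existing header, not
from this patch), the definitions work out to:

	RWSEM_WAITING_BIAS      = -0x00010000	/* 0xffff0000 */
	RWSEM_ACTIVE_READ_BIAS  =  0x00000001
	RWSEM_ACTIVE_WRITE_BIAS = -0x0001ffff	/* 0xfffe0001 */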

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and /sys/<pid>/maps files
  2010-05-12  3:21 ` [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and /sys/<pid>/maps files Michel Lespinasse
@ 2010-05-12 13:10   ` David Howells
  2010-05-12 22:53   ` KOSAKI Motohiro
  1 sibling, 0 replies; 33+ messages in thread
From: David Howells @ 2010-05-12 13:10 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: dhowells, Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

Michel Lespinasse <walken@google.com> wrote:

> - Thread C is a monitoring process trying to read every /proc/pid/maps
>   in the system. This requires acquiring the mmap_sem for read. Thread C
>   blocks behind B, waiting for A to release the rwsem.  If thread C
>   could be allowed to run in parallel with A, it would probably get done
>   long before thread A's disk access completes, thus not actually slowing
>   down thread B.

Is it possible for someone to execute a DoS attack on another process using
a bunch of threads reading /proc/pid/maps?

David

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and /sys/<pid>/maps files
  2010-05-12  3:21 ` [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and /sys/<pid>/maps files Michel Lespinasse
  2010-05-12 13:10   ` David Howells
@ 2010-05-12 22:53   ` KOSAKI Motohiro
  2010-05-12 23:35     ` Michel Lespinasse
  1 sibling, 1 reply; 33+ messages in thread
From: KOSAKI Motohiro @ 2010-05-12 22:53 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: kosaki.motohiro, Linus Torvalds, David Howells, Ingo Molnar,
	Thomas Gleixner, LKML, Andrew Morton, Mike Waychison,
	Suleiman Souhlal, Ying Han

> This helps in the following situation:
> - Thread A takes a page fault while reading or writing memory.
>   do_page_fault() acquires the mmap_sem for read and blocks on disk
>   (either reading the page from file, or hitting swap) for a long time.
> - Thread B does an mmap call and blocks trying to acquire the mmap_sem
>   for write
> - Thread C is a monitoring process trying to read every /proc/pid/maps
>   in the system. This requires acquiring the mmap_sem for read. Thread C
>   blocks behind B, waiting for A to release the rwsem.  If thread C
>   could be allowed to run in parallel with A, it would probably get done
>   long before thread A's disk access completes, thus not actually slowing
>   down thread B.
> 
> Test results with down_read_unfair_test (10 seconds):
> 
> 2.6.33.3:
> threadA completes ~600 faults
> threadB completes ~300 mmap/munmap cycles
> threadC completes ~600 /proc/pid/maps reads
> 
> 2.6.33.3 + down_read_unfair:
> threadA completes ~600 faults
> threadB completes ~300 mmap/munmap cycles
> threadC completes ~160000 /proc/pid/maps reads
> 
> Signed-off-by: Michel Lespinasse <walken@google.com>

Is this a good idea?
I think /proc shouldn't use the unfair variant as a backdoor.
It doesn't only bring a performance improvement; it also
opens up a DoS opportunity.

Please keep this feature for internal use only.



> ---
>  fs/proc/base.c       |    2 +-
>  fs/proc/task_mmu.c   |    2 +-
>  fs/proc/task_nommu.c |    2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 8418fcc..9132488 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1367,7 +1367,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
>  
>  	/* We need mmap_sem to protect against races with removal of
>  	 * VM_EXECUTABLE vmas */
> -	down_read(&mm->mmap_sem);
> +	down_read_unfair(&mm->mmap_sem);
>  	exe_file = mm->exe_file;
>  	if (exe_file)
>  		get_file(exe_file);
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 0705534..09647ad 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -123,7 +123,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
>  	mm = mm_for_maps(priv->task);
>  	if (!mm)
>  		return NULL;
> -	down_read(&mm->mmap_sem);
> +	down_read_unfair(&mm->mmap_sem);
>  
>  	tail_vma = get_gate_vma(priv->task);
>  	priv->tail_vma = tail_vma;
> diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
> index 46d4b5d..56ca830 100644
> --- a/fs/proc/task_nommu.c
> +++ b/fs/proc/task_nommu.c
> @@ -194,7 +194,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
>  		priv->task = NULL;
>  		return NULL;
>  	}
> -	down_read(&mm->mmap_sem);
> +	down_read_unfair(&mm->mmap_sem);
>  
>  	/* start from the Nth VMA */
>  	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
> -- 
> 1.7.0.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/




^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and  /sys/<pid>/maps files
  2010-05-12 22:53   ` KOSAKI Motohiro
@ 2010-05-12 23:35     ` Michel Lespinasse
  2010-05-13  0:32       ` KOSAKI Motohiro
  0 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-12 23:35 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Linus Torvalds, David Howells, Ingo Molnar, Thomas Gleixner, LKML,
	Andrew Morton, Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 3:53 PM, KOSAKI Motohiro
<kosaki.motohiro@jp.fujitsu.com> wrote:
>> This helps in the following situation:
>> - Thread A takes a page fault while reading or writing memory.
>>   do_page_fault() acquires the mmap_sem for read and blocks on disk
>>   (either reading the page from file, or hitting swap) for a long time.
>> - Thread B does an mmap call and blocks trying to acquire the mmap_sem
>>   for write
>> - Thread C is a monitoring process trying to read every /proc/pid/maps
>>   in the system. This requires acquiring the mmap_sem for read. Thread C
>>   blocks behind B, waiting for A to release the rwsem.  If thread C
>>   could be allowed to run in parallel with A, it would probably get done
>>   long before thread A's disk access completes, thus not actually slowing
>>   down thread B.
>>
>> Test results with down_read_unfair_test (10 seconds):
>>
>> 2.6.33.3:
>> threadA completes ~600 faults
>> threadB completes ~300 mmap/munmap cycles
>> threadC completes ~600 /proc/pid/maps reads
>>
>> 2.6.33.3 + down_read_unfair:
>> threadA completes ~600 faults
>> threadB completes ~300 mmap/munmap cycles
>> threadC completes ~160000 /proc/pid/maps reads
>>
>> Signed-off-by: Michel Lespinasse <walken@google.com>
>
> Is this a good idea?
> I think /proc shouldn't use the unfair variant as a backdoor.
> It doesn't only bring a performance improvement; it also
> opens up a DoS opportunity.

I am not entirely surprised that there is some level of opposition to
this change (which is in part why it went last in the series).

Besides keeping it internal, would there be ways to make it acceptable
to the community? For example, would it be fine if unfair behavior
was only used if the caller thread runs with root privilege?
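
For example, something along these lines in the /proc callers (just a sketch
of the idea, not part of this series):

	if (capable(CAP_SYS_ADMIN))
		down_read_unfair(&mm->mmap_sem);
	else
		down_read(&mm->mmap_sem);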

In my opinion the optimal behavior would be if the rwsem could be
allowed to be grabbed unfairly only as long as there are still fair
readers on it. However, I don't see how to achieve this given that we
don't want to slow down the regular, fair code paths.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and  /sys/<pid>/maps files
  2010-05-12 23:35     ` Michel Lespinasse
@ 2010-05-13  0:32       ` KOSAKI Motohiro
  0 siblings, 0 replies; 33+ messages in thread
From: KOSAKI Motohiro @ 2010-05-13  0:32 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: kosaki.motohiro, Linus Torvalds, David Howells, Ingo Molnar,
	Thomas Gleixner, LKML, Andrew Morton, Mike Waychison,
	Suleiman Souhlal, Ying Han

> > Is this a good idea?
> > I think /proc shouldn't use the unfair variant as a backdoor.
> > It doesn't only bring a performance improvement; it also
> > opens up a DoS opportunity.
> 
> I am not entirely surprised that there is some level of opposition to
> this change (which is in part why it went last in the series).
> 
> Besides keeping it internal, would there be ways to make it acceptable
> to the community? For example, would it be fine if unfair behavior
> was only used if the caller thread runs with root privilege?

Hmm, that doesn't seem like a good idea.

Why?

Nowadays, root privilege should be considered something a security module may
block, but this way of using it can't be combined with any security module.
At the least we would need a more proper, security-friendly interface.

> In my opinion the optimal behavior would be if the rwsem could be
> allowed to be grabbed unfairly only as long as there are still fair
> readers on it. However, I don't see how to achieve this given that we
> don't want to slow down the regular, fair code paths.

A dumb question:
why do you need to read /proc/<pid>/exe and /proc/<pid>/maps?
Would adding new /proc files help?

IOW, do we really need an unfair reader? Can't we make the locking more
fine-grained?




^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 02/12] rwsem: use single atomic update for sem count when  waking up readers.
  2010-05-12 11:01   ` David Howells
@ 2010-05-13  0:54     ` Michel Lespinasse
  0 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-13  0:54 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML, Andrew Morton,
	Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 4:01 AM, David Howells <dhowells@redhat.com> wrote:
> Michel Lespinasse <walken@google.com> wrote:
>
>> - *   - there must be someone on the queue
>> + * - there must be someone on the queue
>
> Why did you change this comment?  This is still a guarantee up_xxxx() must
> make about the state of the rwsem.

What I meant is that we do rely on there being someone on the queue
even if not coming from up_xxxx().

>> + retry_readers:
>> +     oldcount = rwsem_atomic_update(woken, sem) - woken;
>> +     if (!downgrading && (oldcount & RWSEM_ACTIVE_MASK))
>
> The problem with doing this here is that you may just have wasted all the work
> you did working out what woken is going to be.  That may have been quite slow
> as the CPU may have had to get hold of a bunch of cachelines that weren't in
> its cache.  Furthermore, you are doing this under a spinlock, so you may have
> lost your right to wake anyone up, and you'll be blocking the CPU that will be
> allowed to perform the wakeup.
>
> Incrementing the count first nets you a guarantee that you have the right to
> wake things up.
>
> You may point out that if there's no contention, then what your revised code
> does doesn't slow anything down.  That's true, but on modern CPUs, neither
> does the old code as the exclusively held cache line will lurk in the CPU's
> cache until there's contention on it.

My reasoning was more that in all cases except downgrade_write()
(which gets an exemption anyway because we know nobody can grab the
write lock from it) we've already checked the active count and found
it to be zero, so there is not much point checking again at the start
of __rwsem_do_wake(). However, you are correct that incrementing the
active count at that point could be important, just to guarantee that
nobody can grab a write lock while we count how many readers we want
to wake.

I'll make sure to address this.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 04/12] rwsem: consistently use adjustment variable
  2010-05-12 11:45   ` David Howells
@ 2010-05-13  1:12     ` Michel Lespinasse
  0 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-13  1:12 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML, Andrew Morton,
	Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 4:45 AM, David Howells <dhowells@redhat.com> wrote:
> From a human point of view, I would say keep woken, but use it to initialise
> adjustment:
>
>        adjustment = woken * RWSEM_ACTIVE_READ_BIAS;
>
> and use it to initialise 'loop':
>
>        for (loop = woken; loop > 0; loop--) {

Sounds good. I'll have it in next version.

(Arguably 'woken' might be confusing too given that we only wake the
threads later on. But I don't care much either way).

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 06/12] rwsem: wake queued readers when other readers are  active
  2010-05-12 12:22   ` David Howells
@ 2010-05-13  2:39     ` Michel Lespinasse
  2010-05-13  5:41       ` Michel Lespinasse
  0 siblings, 1 reply; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-13  2:39 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML, Andrew Morton,
	Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 5:22 AM, David Howells <dhowells@redhat.com> wrote:
> Michel Lespinasse <walken@google.com> wrote:
>
>> In this situation, it would be perfectly fine to let threads B and C work
>> in parallel as they each only want a read acquire on the rwsem. We can
>> recognize this situation and let A wake B as long as there are no active
>> writers on the rwsem.
>
> There can't be any active writers on the rwsem.  An active writer must have
> just been upped and is in the process of waking the first sleeper up.

Yes. My point is that by the time thread A (the writer that just got
upped) gets around to waking B (a blocked reader), another reader C
might have gotten active already. We don't want the nonzero active
count (due to C) to prevent B from getting woken.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common
  2010-05-12 12:42   ` David Howells
@ 2010-05-13  2:54     ` Michel Lespinasse
  0 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-13  2:54 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML, Andrew Morton,
	Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 5:42 AM, David Howells <dhowells@redhat.com> wrote:
> Michel Lespinasse <walken@google.com> wrote:
>
>> Following change also enjoys having flags available in a register rather
>> than having to fish it out in the struct rwsem_waiter...
>
> So what?  It doesn't use flags in rwsem_down_failed_common().

I meant patch 10 wants to use the flags in order to know whether to
queue at the head or tail of the waiter list.

Apologies for the compile error; patches 8 and 10 used to be a single
change and were split off relatively late. :/

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 06/12] rwsem: wake queued readers when other readers are  active
  2010-05-13  2:39     ` Michel Lespinasse
@ 2010-05-13  5:41       ` Michel Lespinasse
  0 siblings, 0 replies; 33+ messages in thread
From: Michel Lespinasse @ 2010-05-13  5:41 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, LKML, Andrew Morton,
	Mike Waychison, Suleiman Souhlal, Ying Han

On Wed, May 12, 2010 at 7:39 PM, Michel Lespinasse <walken@google.com> wrote:
> On Wed, May 12, 2010 at 5:22 AM, David Howells <dhowells@redhat.com> wrote:
>> Michel Lespinasse <walken@google.com> wrote:
>>
>>> In this situation, it would be perfectly fine to let threads B and C work
>>> in parallel as they each only want a read acquire on the rwsem. We can
>>> recognize this situation and let A wake B as long as there are no active
>>> writers on the rwsem.
>>
>> There can't be any active writers on the rwsem.  An active writer must have
>> just been upped and is in the process of waking the first sleeper up.
>
> Yes. My point is that by the time thread A (the writer that just got
> upped) gets around to waking B (a blocked reader), another reader C
> might have gotten active already. We don't want the nonzero active
> count (due to C) to prevent B from getting woken.

My bad - this is actually fine. C will notice there are still waiting
threads, so it will run rwsem_down_read_failed and queue itself. At
this point the active count will go back down to 0 and B and C will
both get woken.

I'll merge this back into change 7 since change 7 does require this in
order to work.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread (newest message: 2010-05-13  5:41 UTC)

Thread overview: 33+ messages
2010-05-12  3:20 [PATCH 00/12] rwsem changes + down_read_unfair() proposal Michel Lespinasse
2010-05-12  3:20 ` [PATCH 01/12] rwsem: test for no active locks in __rwsem_do_wake undo code Michel Lespinasse
2010-05-12 10:39   ` David Howells
2010-05-12  3:20 ` [PATCH 02/12] rwsem: use single atomic update for sem count when waking up readers Michel Lespinasse
2010-05-12 11:01   ` David Howells
2010-05-13  0:54     ` Michel Lespinasse
2010-05-12 11:36   ` David Howells
2010-05-12  3:20 ` [PATCH 03/12] rwsem: let RWSEM_WAITING_BIAS represent any number of waiting threads Michel Lespinasse
2010-05-12  3:20 ` [PATCH 04/12] rwsem: consistently use adjustment variable Michel Lespinasse
2010-05-12 11:45   ` David Howells
2010-05-13  1:12     ` Michel Lespinasse
2010-05-12  3:20 ` [PATCH 05/12] x86 rwsem: take advantage of new RWSEM_WAITING_BIAS semantics Michel Lespinasse
2010-05-12 12:10   ` David Howells
2010-05-12  3:20 ` [PATCH 06/12] rwsem: wake queued readers when other readers are active Michel Lespinasse
2010-05-12 12:22   ` David Howells
2010-05-13  2:39     ` Michel Lespinasse
2010-05-13  5:41       ` Michel Lespinasse
2010-05-12  3:20 ` [PATCH 07/12] rwsem: wake queued readers when writer blocks on active read lock Michel Lespinasse
2010-05-12 12:33   ` David Howells
2010-05-12  3:20 ` [PATCH 08/12] rwsem: smaller wrappers around rwsem_down_failed_common Michel Lespinasse
2010-05-12 12:36   ` David Howells
2010-05-12 12:42   ` David Howells
2010-05-13  2:54     ` Michel Lespinasse
2010-05-12  3:20 ` [PATCH 09/12] generic rwsem: implement down_read_unfair Michel Lespinasse
2010-05-12 12:46   ` David Howells
2010-05-12  3:21 ` [PATCH 10/12] rwsem: down_read_unfair infrastructure support Michel Lespinasse
2010-05-12  3:21 ` [PATCH 11/12] x86 rwsem: down_read_unfair implementation Michel Lespinasse
2010-05-12 13:08   ` David Howells
2010-05-12  3:21 ` [PATCH 12/12] Use down_read_unfair() for /sys/<pid>/exe and /sys/<pid>/maps files Michel Lespinasse
2010-05-12 13:10   ` David Howells
2010-05-12 22:53   ` KOSAKI Motohiro
2010-05-12 23:35     ` Michel Lespinasse
2010-05-13  0:32       ` KOSAKI Motohiro
