All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ryan Hope <rmh3093@gmail.com>
To: LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH][RESEND] scalable rw_mutex
Date: Thu, 07 Aug 2008 18:56:45 -0400	[thread overview]
Message-ID: <489B7DAD.3080104@gmail.com> (raw)

This was posted sometime last year I think and it never got merged. Can this get 
a go around in -mm, it would help in converting the semaphore's in reiser4 to 
mutexes.

diff --git a/include/linux/rwmutex.h b/include/linux/rwmutex.h
new file mode 100644
index 0000000..39ec857
--- /dev/null
+++ b/include/linux/rwmutex.h
@@ -0,0 +1,83 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+#ifndef _LINUX_RWMUTEX_H
+#define _LINUX_RWMUTEX_H
+
+#include <linux/preempt.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+struct rw_mutex {
+	/* Read mostly global */
+	struct percpu_counter	readers;
+	unsigned int		status;
+
+	/* The following variables are only for the slowpath */
+	struct mutex		read_mutex;	/* r -> w waiting */
+	struct mutex		write_mutex;	/* w -> w waiting */
+	struct task_struct	*waiter;	/* w -> r waiting */
+	atomic_t		read_waiters;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+};
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
+		struct lock_class_key *key);
+void rw_mutex_destroy(struct rw_mutex *rw_mutex);
+
+#define rw_mutex_init(rw_mutex)					\
+	do {							\
+		static struct lock_class_key __key;		\
+		__rw_mutex_init((rw_mutex), #rw_mutex, &__key);	\
+	} while (0)
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+	int ret = __rw_mutex_read_trylock(rw_mutex);
+	if (ret)
+		rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
+	return ret;
+}
+
+static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+	int ret;
+
+	might_sleep();
+	rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
+
+	ret = __rw_mutex_read_trylock(rw_mutex);
+	if (!ret)
+		rw_mutex_read_lock_slow(rw_mutex);
+}
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
+{
+	return mutex_is_locked(&rw_mutex->write_mutex);
+}
+
+static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
+{
+	rw_mutex_write_lock_nested(rw_mutex, 0);
+}
+
+#endif /* _LINUX_RWMUTEX_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index dd58bdc..8277ef5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
  	    rcupdate.o extable.o params.o posix-timers.o \
  	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
  	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o rwmutex.o

  CFLAGS_REMOVE_sched.o = -mno-spe

diff --git a/kernel/rwmutex.c b/kernel/rwmutex.c
new file mode 100644
index 0000000..2b82d11
--- /dev/null
+++ b/kernel/rwmutex.c
@@ -0,0 +1,256 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Its scalable in that the read count is a percpu counter and the reader fast
+ * path does not write to a shared cache-line.
+ *
+ * Its not FIFO fair, but starvation proof by alternating readers and writers.
+ */
+#include <linux/sched.h>
+#include <linux/rwmutex.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+/*
+ * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXlusion'
+ *
+ * However in this context we take mutex to mean a sleeping lock, with the
+ * property that it must be released by the same context that acquired it.
+ *
+ * design goals:
+ *
+ * A sleeping reader writer lock with a scalable read side, to avoid bouncing
+ * cache-lines.
+ *
+ * dynamics:
+ *
+ * The reader fast path is modification of a percpu_counter and a read of a
+ * shared cache-line.
+ *
+ * The write side is quite heavy; it takes two mutexes, a writer mutex and a
+ * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
+ * for r -> w. The read side is forced into the slow path by setting the
+ * status bit. Then it waits for all current readers to disappear.
+ *
+ * The read lock slow path; taken when the status bit is set; takes the read
+ * mutex. Because the write side also takes this mutex, the new readers are
+ * blocked. The read unlock slow path tickles the writer every time a read
+ * lock is released.
+ *
+ * Write unlock clears the status bit, and drops the read mutex; allowing new
+ * readers. It then waits for at least one waiting reader to get a lock (if
+ * there were any readers waiting) before releasing the write mutex which will
+ * allow possible other writers to come in an stop new readers, thus avoiding
+ * starvation by alternating between readers and writers
+ *
+ * considerations:
+ *
+ * The lock's space footprint is quite large (on x86_64):
+ *
+ *   96 bytes				[struct rw_mutex]
+ *    8 bytes per cpu NR_CPUS		[void *]
+ *   32 bytes per cpu (NR_CPUS ?= cpu_possible_map ?= nr_cpu_ids)
+ *					[smallest slab]
+ *
+ * 1376 bytes for x86_64 defconfig (NR_CPUS = 32)
+ */
+
+#define RW_MUTEX_READER_FAST 	0
+#define RW_MUTEX_READER_SLOW	1
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
+		struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
+	lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
+#endif
+
+	percpu_counter_init(&rw_mutex->readers, 0);
+	rw_mutex->status = RW_MUTEX_READER_FAST;
+	mutex_init(&rw_mutex->read_mutex);
+	mutex_init(&rw_mutex->write_mutex);
+	rw_mutex->waiter = NULL;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	printk("rw_mutex size: %u\n", sizeof(struct rw_mutex));
+#endif
+}
+EXPORT_SYMBOL(__rw_mutex_init);
+
+void rw_mutex_destroy(struct rw_mutex *rw_mutex)
+{
+	percpu_counter_destroy(&rw_mutex->readers);
+	mutex_destroy(&rw_mutex->read_mutex);
+	mutex_destroy(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_destroy);
+
+static inline void rw_mutex_readers_inc(struct rw_mutex *rw_mutex)
+{
+	percpu_counter_inc(&rw_mutex->readers);
+	smp_wmb();
+}
+
+static inline void rw_mutex_readers_dec(struct rw_mutex *rw_mutex)
+{
+	percpu_counter_dec(&rw_mutex->readers);
+	smp_wmb();
+}
+
+static inline long rw_mutex_readers(struct rw_mutex *rw_mutex)
+{
+	smp_rmb();
+	return percpu_counter_sum(&rw_mutex->readers);
+}
+
+#define rw_mutex_writer_wait(rw_mutex, condition)			\
+do {									\
+	struct task_struct *tsk = current;				\
+									\
+	BUG_ON((rw_mutex)->waiter);					\
+	set_task_state(tsk, TASK_UNINTERRUPTIBLE);			\
+	get_task_struct(tsk);						\
+	(rw_mutex)->waiter = tsk;					\
+	smp_wmb();							\
+	while (!(condition)) {						\
+		schedule();						\
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);		\
+	}								\
+	tsk->state = TASK_RUNNING;					\
+	(rw_mutex)->waiter = NULL;					\
+	put_task_struct(tsk);						\
+} while (0)
+
+static inline void rw_mutex_writer_wake(struct rw_mutex *rw_mutex)
+{
+	struct task_struct *tsk;
+
+	smp_rmb();
+	tsk = rw_mutex->waiter;
+	if (tsk)
+		wake_up_process(tsk);
+}
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
+{
+	/*
+	 * read lock slow path;
+	 * count the number of readers waiting on the read_mutex
+	 */
+	atomic_inc(&rw_mutex->read_waiters);
+	mutex_lock(&rw_mutex->read_mutex);
+
+	/*
+	 * rw_mutex->state is only set while the read_mutex is held
+	 * so by serialising on this lock, we're sure its free.
+	 */
+	BUG_ON(rw_mutex->status);
+
+	rw_mutex_readers_inc(rw_mutex);
+
+	/*
+	 * wake up a possible write unlock; waiting for at least a single
+	 * reader to pass before letting a new writer through.
+	 */
+	atomic_dec(&rw_mutex->read_waiters);
+	rw_mutex_writer_wake(rw_mutex);
+	mutex_unlock(&rw_mutex->read_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_lock_slow);
+
+static inline
+void rw_mutex_status_set(struct rw_mutex *rw_mutex, unsigned int status)
+{
+	rw_mutex->status = status;
+	/*
+	 * allow new readers to see this change in status
+	 */
+	smp_wmb();
+}
+
+static inline unsigned int rw_mutex_reader_slow(struct rw_mutex *rw_mutex)
+{
+	/*
+	 * match rw_mutex_status_set()
+	 */
+	smp_rmb();
+	return rw_mutex->status;
+}
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+	rw_mutex_readers_inc(rw_mutex);
+	if (unlikely(rw_mutex_reader_slow(rw_mutex))) {
+		rw_mutex_readers_dec(rw_mutex);
+		/*
+		 * possibly wake up a writer waiting for this reference to
+		 * disappear
+		 */
+		rw_mutex_writer_wake(rw_mutex);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(__rw_mutex_read_trylock);
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
+{
+	rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+	rw_mutex_readers_dec(rw_mutex);
+	/*
+	 * on the slow path;
+	 * nudge the writer waiting for the last reader to go away
+	 */
+	if (unlikely(rw_mutex_reader_slow(rw_mutex)))
+		rw_mutex_writer_wake(rw_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_unlock);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
+{
+	might_sleep();
+	rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
+
+	mutex_lock_nested(&rw_mutex->write_mutex, subclass);
+
+	/*
+	 * block new readers
+	 */
+	mutex_lock_nested(&rw_mutex->read_mutex, subclass);
+	rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
+	/*
+	 * and wait for all current readers to go away
+	 */
+	rw_mutex_writer_wait(rw_mutex, (rw_mutex_readers(rw_mutex) == 0));
+}
+EXPORT_SYMBOL(rw_mutex_write_lock_nested);
+
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
+{
+	int waiters;
+
+	might_sleep();
+	rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+	/*
+	 * let the readers rip
+	 */
+	rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_FAST);
+	waiters = atomic_read(&rw_mutex->read_waiters);
+	mutex_unlock(&rw_mutex->read_mutex);
+	/*
+	 * wait for at least 1 reader to get through
+	 */
+	if (waiters) {
+		rw_mutex_writer_wait(rw_mutex,
+			(atomic_read(&rw_mutex->read_waiters) < waiters));
+	}
+	/*
+	 * before we let the writers rip
+	 */
+	mutex_unlock(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_write_unlock);

             reply	other threads:[~2008-08-07 22:57 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-08-07 22:56 Ryan Hope [this message]
2008-08-08 10:21 ` [PATCH][RESEND] scalable rw_mutex Peter Zijlstra
2008-08-08 14:04   ` Ryan Hope
2008-08-08 14:15     ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=489B7DAD.3080104@gmail.com \
    --to=rmh3093@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.