From: Peter Zijlstra <peterz@infradead.org>
To: Oleg Nesterov <oleg@redhat.com>,
Will Deacon <will.deacon@kernel.org>,
Ingo Molnar <mingo@kernel.org>,
Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org, bigeasy@linutronix.de,
juri.lelli@redhat.com, williams@redhat.com, bristot@redhat.com,
longman@redhat.com, dave@stgolabs.net, oleg@redhat.com,
jack@suse.com
Subject: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
Date: Mon, 5 Aug 2019 16:02:41 +0200 [thread overview]
Message-ID: <20190805140241.GI2332@hirez.programming.kicks-ass.net> (raw)
The filesystem freezer uses percpu_rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.
When -RT re-implements rwsem this house of cards comes undone.
Re-implement percpu_rwsem without relying on rwsem.
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: jack@suse.com
---
include/linux/percpu-rwsem.h | 72 +++++++++++++-------------
include/linux/wait.h | 3 +
kernel/cpu.c | 4 -
kernel/locking/percpu-rwsem.c | 116 +++++++++++++++++++++++++-----------------
4 files changed, 112 insertions(+), 83 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,39 +5,49 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
-#include <linux/rcuwait.h>
+#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
struct rcu_sync rss;
unsigned int __percpu *read_count;
- struct rw_semaphore rw_sem; /* slowpath */
- struct rcuwait writer; /* blocked writer */
- int readers_block;
+ wait_queue_head_t waiters;
+ atomic_t block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
#define __DEFINE_PERCPU_RWSEM(name, is_static) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
is_static struct percpu_rw_semaphore name = { \
.rss = __RCU_SYNC_INITIALIZER(name.rss), \
.read_count = &__percpu_rwsem_rc_##name, \
- .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
- .writer = __RCUWAIT_INITIALIZER(name.writer), \
+ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \
+ .block = ATOMIC_INIT(0), \
+ __PERCPU_RWSEM_DEP_MAP_INIT(name) \
}
+
#define DEFINE_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, static)
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
preempt_disable();
/*
@@ -58,42 +68,42 @@ static inline void percpu_down_read(stru
preempt_enable();
}
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- int ret = 1;
-
preempt_disable();
/*
* Same as in percpu_down_read().
*/
__this_cpu_inc(*sem->read_count);
- if (unlikely(!rcu_sync_is_idle(&sem->rss)))
- ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
+ if (unlikely(!rcu_sync_is_idle(&sem->rss))) {
+ if (!__percpu_down_read(sem, true)) /* Unconditional memory barrier */
+ return false;
+ }
preempt_enable();
/*
* The barrier() from preempt_enable() prevents the compiler from
* bleeding the critical section out.
*/
- if (ret)
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
-
- return ret;
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ return true;
}
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
preempt_disable();
/*
* Same as in percpu_down_read().
*/
- if (likely(rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss))) {
__this_cpu_dec(*sem->read_count);
- else
- __percpu_up_read(sem); /* Unconditional memory barrier */
- preempt_enable();
+ preempt_enable();
+ return;
+ }
- rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+ __percpu_up_read(sem); /* Unconditional memory barrier */
}
extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,29 +120,19 @@ extern void percpu_free_rwsem(struct per
__percpu_init_rwsem(sem, #sem, &rwsem_key); \
})
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem) \
- lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem) lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem)
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_release(&sem->rw_sem.dep_map, 1, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
+ lock_release(&sem->dep_map, 1, ip);
}
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
+ lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}
#endif
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -408,6 +408,9 @@ do { \
__wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \
} while (0)
+#define wait_event_exclusive(wq_head, condition) \
+ wait_event_exclusive_cmd(wq_head, condition, ,)
+
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
cmd1; schedule(); cmd2)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
static void lockdep_acquire_cpus_lock(void)
{
- rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.dep_map, 1, _THIS_IP_);
}
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -2,6 +2,7 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
@@ -11,17 +12,19 @@
#include "rwsem.h"
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
- const char *name, struct lock_class_key *rwsem_key)
+ const char *name, struct lock_class_key *key)
{
sem->read_count = alloc_percpu(int);
if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss);
- __init_rwsem(&sem->rw_sem, name, rwsem_key);
- rcuwait_init(&sem->writer);
- sem->readers_block = 0;
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,59 +44,62 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+/*
+ * Called with preemption disabled, and returns with preemption disabled,
+ * except when it fails the trylock.
+ */
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * If the reader misses the writer's assignment of readers_block, then
- * the writer is guaranteed to see the reader's increment.
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
*
* Conversely, any readers that increment their sem->read_count after
- * the writer looks are guaranteed to see the readers_block value,
- * which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter that the
- * writer missed them.
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
*/
+again:
smp_mb(); /* A matches D */
/*
- * If !readers_block the critical section starts here, matched by the
+ * If !sem->block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!smp_load_acquire(&sem->readers_block)))
- return 1;
+ if (likely(!atomic_read_acquire(&sem->block)))
+ return true;
/*
* Per the above comment; we still have preemption disabled and
* will thus decrement on the same CPU as we incremented.
*/
- __percpu_up_read(sem);
+ __percpu_up_read(sem); /* implies preempt_enable() */
if (try)
- return 0;
+ return false;
- /*
- * We either call schedule() in the wait, or we'll fall through
- * and reschedule on the preempt_enable() in percpu_down_read().
- */
- preempt_enable_no_resched();
+ wait_event(sem->waiters, !atomic_read_acquire(&sem->block));
+ preempt_disable();
+ __this_cpu_inc(*sem->read_count);
/*
- * Avoid lockdep for the down/up_read() we already have them.
+ * percpu_down_write() could've set sem->block right after we've seen
+ * it 0 but missed our this_cpu_inc(), which is exactly the condition
+ * we get called for from percpu_down_read().
*/
- __down_read(&sem->rw_sem);
- this_cpu_inc(*sem->read_count);
- __up_read(&sem->rw_sem);
-
- preempt_disable();
- return 1;
+ goto again;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
+/*
+ * Called with preemption disabled, returns with preemption enabled.
+ */
void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
smp_mb(); /* B matches C */
@@ -103,9 +109,10 @@ void __percpu_up_read(struct percpu_rw_s
* critical section.
*/
__this_cpu_dec(*sem->read_count);
+ preempt_enable();
- /* Prod writer to recheck readers_active */
- rcuwait_wake_up(&sem->writer);
+ /* Prod writer to re-evaluate readers_active_check() */
+ wake_up(&sem->waiters);
}
EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -124,6 +131,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
* zero. If this sum is zero, then it is stable due to the fact that if any
* newly arriving readers increment a given counter, they will immediately
* decrement that same counter.
+ *
+ * Assumes sem->block is set.
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
@@ -140,29 +149,37 @@ static bool readers_active_check(struct
return true;
}
+static inline bool try_acquire_block(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- down_write(&sem->rw_sem);
-
/*
- * Notify new readers to block; up until now, and thus throughout the
- * longish rcu_sync_enter() above, new readers could still come in.
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
*/
- WRITE_ONCE(sem->readers_block, 1);
+ wait_event_exclusive(sem->waiters, try_acquire_block(sem));
- smp_mb(); /* D matches A */
+ /* smp_mb() implied by acquire_block() on success -- D matches A */
/*
- * If they don't see our writer of readers_block, then we are
- * guaranteed to see their sem->read_count increment, and therefore
- * will wait for them.
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
*/
- /* Wait for all now active readers to complete. */
- rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+ /* Wait for all active readers to complete. */
+ wait_event(sem->waiters, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -178,12 +195,19 @@ void percpu_up_write(struct percpu_rw_se
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->readers_block, 0);
+ atomic_set_release(&sem->block, 0);
/*
- * Release the write lock, this will allow readers back in the game.
+ * Prod any pending reader/writer to make progress.
+ *
+ * While there is no fairness guarantee; readers are waiting !exclusive
+ * and will thus be on the wait_list head, while writers are waiting
+ * exclusive and will thus be on the wait_list tail.
+ *
+ * Therefore it is more likely a reader will acquire the lock; if there
+ * are any.
*/
- up_write(&sem->rw_sem);
+ wake_up(&sem->waiters);
/*
* Once this completes (at least one RCU-sched grace period hence) the
@@ -191,5 +215,7 @@ void percpu_up_write(struct percpu_rw_se
* exclusive write lock because its counting.
*/
rcu_sync_exit(&sem->rss);
+
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
next reply other threads:[~2019-08-05 14:02 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-08-05 14:02 Peter Zijlstra [this message]
2019-08-05 14:43 ` [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem Boqun Feng
2019-08-05 14:58 ` Boqun Feng
2019-08-05 15:43 ` Peter Zijlstra
2019-08-06 14:15 ` Boqun Feng
2019-08-06 16:17 ` Oleg Nesterov
2019-08-06 17:15 ` Peter Zijlstra
2019-08-07 9:56 ` Oleg Nesterov
2019-10-29 18:47 ` Peter Zijlstra
2019-10-30 14:21 ` Oleg Nesterov
2019-10-30 16:09 ` Peter Zijlstra
2019-10-30 17:52 ` Peter Zijlstra
2019-10-30 18:47 ` Peter Zijlstra
2019-10-30 19:31 ` Peter Zijlstra
2019-10-31 6:11 ` Peter Zijlstra
2019-08-07 14:45 ` Will Deacon
2019-10-29 19:06 ` Peter Zijlstra
2019-10-30 15:57 ` Oleg Nesterov
2019-10-30 16:47 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190805140241.GI2332@hirez.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=bigeasy@linutronix.de \
--cc=bristot@redhat.com \
--cc=dave@stgolabs.net \
--cc=jack@suse.com \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mingo@kernel.org \
--cc=oleg@redhat.com \
--cc=tglx@linutronix.de \
--cc=will.deacon@kernel.org \
--cc=williams@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.