public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>, John Stultz <john.stultz@linaro.org>,
	Ingo Molnar <mingo@redhat.com>,
	lkml <linux-kernel@vger.kernel.org>,
	Dmitry Shmidt <dimitrysh@google.com>,
	Rom Lemarchand <romlem@google.com>,
	Colin Cross <ccross@google.com>, Todd Kjos <tkjos@google.com>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Subject: Re: Severe performance regression w/ 4.4+ on Android due to cgroup locking changes
Date: Thu, 14 Jul 2016 21:35:37 +0200	[thread overview]
Message-ID: <20160714193537.GQ30927@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <20160714183609.GJ30154@twins.programming.kicks-ass.net>

On Thu, Jul 14, 2016 at 08:36:09PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 14, 2016 at 08:09:52PM +0200, Oleg Nesterov wrote:
> > On 07/14, Peter Zijlstra wrote:
> > >
> > > The below is a compile tested only first draft so far. I'll go give it
> > > some runtime next.
> > 
> > So I will wait for the new version, but at first glance this matches the
> > code I already reviewed in the past (at least, tried hard to review ;)
> > and it looks correct.
> > 
> > Just a couple of almost cosmetic nits, feel free to ignore.
> 
> Drat, I just mailed out the patches.. I can do a second version later.

How about so on top of what I sent?


--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,11 +13,10 @@ struct percpu_rw_semaphore {
 	unsigned int __percpu	*read_count;
 	struct rw_semaphore	rw_sem;
 	wait_queue_head_t	writer;
-	int			state;
+	int			readers_block;
 };
 
-extern void __percpu_down_read(struct percpu_rw_semaphore *);
-extern int  __percpu_down_read_trylock(struct percpu_rw_semaphore *);
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -37,7 +36,7 @@ static inline void percpu_down_read(stru
 	 */
 	__this_cpu_inc(*sem->read_count);
 	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
-		__percpu_down_read(sem); /* Unconditional memory barrier */
+		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
 	 * The barrier() from preempt_enable() prevents the compiler from
@@ -55,7 +54,7 @@ static inline int percpu_down_read_trylo
 	 */
 	__this_cpu_inc(*sem->read_count);
 	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
-		ret = __percpu_down_read_trylock(sem);
+		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
 	 * The barrier() from preempt_enable() prevents the compiler from
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,8 +8,6 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-enum { readers_slow, readers_block };
-
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
@@ -21,7 +19,7 @@ int __percpu_init_rwsem(struct percpu_rw
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	init_waitqueue_head(&sem->writer);
-	sem->state = readers_slow;
+	sem->readers_block = 0;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,21 +39,21 @@ void percpu_free_rwsem(struct percpu_rw_
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-void __percpu_down_read(struct percpu_rw_semaphore *sem)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 {
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * And yes, if the reader misses the writer's assignment of
-	 * readers_block to sem->state, then the writer is
-	 * guaranteed to see the reader's increment.  Conversely, any
-	 * readers that increment their sem->read_count after the
-	 * writer looks are guaranteed to see the readers_block value,
+	 * If the reader misses the writer's assignment of readers_block, then
+	 * the writer is guaranteed to see the reader's increment.
+	 *
+	 * Conversely, any readers that increment their sem->read_count after
+	 * the writer looks are guaranteed to see the readers_block value,
 	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter
-	 * that the writer missed them.
+	 * decrement their sem->read_count, so that it doesn't matter that the
+	 * writer missed them.
 	 */
 
 	smp_mb(); /* A matches D */
@@ -64,8 +62,8 @@ void __percpu_down_read(struct percpu_rw
 	 * If !readers_block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(smp_load_acquire(&sem->state) != readers_block))
-		return;
+	if (likely(!smp_load_acquire(&sem->readers_block)))
+		return 1;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
@@ -73,6 +71,9 @@ void __percpu_down_read(struct percpu_rw
 	 */
 	__percpu_up_read(sem);
 
+	if (try)
+		return 0;
+
 	/*
 	 * We either call schedule() in the wait, or we'll fall through
 	 * and reschedule on the preempt_enable() in percpu_down_read().
@@ -87,21 +88,10 @@ void __percpu_down_read(struct percpu_rw
 	__up_read(&sem->rw_sem);
 
 	preempt_disable();
+	return 1;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-int __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
-{
-	smp_mb(); /* A matches D */
-
-	if (likely(smp_load_acquire(&sem->state) != readers_block))
-		return 1;
-
-	__percpu_up_read(sem);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__percpu_down_read_trylock);
-
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	smp_mb(); /* B matches C */
@@ -150,23 +140,23 @@ static bool readers_active_check(struct
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
-	down_write(&sem->rw_sem);
-
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
+	down_write(&sem->rw_sem);
+
 	/*
 	 * Notify new readers to block; up until now, and thus throughout the
 	 * longish rcu_sync_enter() above, new readers could still come in.
 	 */
-	WRITE_ONCE(sem->state, readers_block);
+	WRITE_ONCE(sem->readers_block, 1);
 
 	smp_mb(); /* D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block to sem->state,
-	 * then we are guaranteed to see their sem->read_count increment, and
-	 * therefore will wait for them.
+	 * If they don't see our writer of readers_block, then we are
+	 * guaranteed to see their sem->read_count increment, and therefore
+	 * will wait for them.
 	 */
 
 	/* Wait for all now active readers to complete. */
@@ -186,7 +176,7 @@ void percpu_up_write(struct percpu_rw_se
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->state, readers_slow);
+	smp_store_release(&sem->readers_block, 0);
 
 	/*
 	 * Release the write lock, this will allow readers back in the game.
@@ -194,9 +184,9 @@ void percpu_up_write(struct percpu_rw_se
 	up_write(&sem->rw_sem);
 
 	/*
-	 * Once this completes (at least one RCU grace period hence) the reader
-	 * fast path will be available again. Safe to use outside the exclusive
-	 * write lock because its counting.
+	 * Once this completes (at least one RCU-sched grace period hence) the
+	 * reader fast path will be available again. Safe to use outside the
+	 * exclusive write lock because its counting.
 	 */
 	rcu_sync_exit(&sem->rss);
 }

  reply	other threads:[~2016-07-14 19:35 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-07-13  0:00 Severe performance regression w/ 4.4+ on Android due to cgroup locking changes John Stultz
2016-07-13  8:21 ` Peter Zijlstra
2016-07-13 14:42   ` Paul E. McKenney
2016-07-13 18:13     ` Dmitry Shmidt
2016-07-13 18:32       ` Paul E. McKenney
2016-07-13 18:21 ` Tejun Heo
2016-07-13 18:33   ` Tejun Heo
2016-07-13 20:13     ` John Stultz
2016-07-13 20:18       ` Tejun Heo
2016-07-13 20:26         ` Peter Zijlstra
2016-07-13 20:39           ` Tejun Heo
2016-07-13 20:51             ` Peter Zijlstra
2016-07-13 21:01               ` Tejun Heo
2016-07-13 21:03               ` Paul E. McKenney
2016-07-13 21:05                 ` Tejun Heo
2016-07-13 21:18                   ` Paul E. McKenney
2016-07-13 21:42                     ` Paul E. McKenney
2016-07-13 21:46                       ` John Stultz
2016-07-13 22:17                         ` Paul E. McKenney
2016-07-13 22:39                           ` John Stultz
2016-07-13 23:02                             ` Paul E. McKenney
2016-07-13 23:04                               ` Paul E. McKenney
2016-07-14 11:35                                 ` Tejun Heo
2016-07-14 12:04                                   ` Peter Zijlstra
2016-07-14 12:08                                     ` Tejun Heo
2016-07-14 12:20                                       ` Peter Zijlstra
2016-07-14 15:07                                         ` Tejun Heo
2016-07-14 15:24                                           ` Tejun Heo
2016-07-14 16:32                                           ` Peter Zijlstra
2016-07-14 17:34                                             ` Oleg Nesterov
2016-07-14 16:54                               ` John Stultz
2016-07-13 22:25                       ` John Stultz
2016-07-13 22:01                     ` Tejun Heo
2016-07-13 22:33                       ` Paul E. McKenney
2016-07-14  6:49                       ` Peter Zijlstra
2016-07-14 11:20                         ` Tejun Heo
2016-07-14 12:11                           ` Peter Zijlstra
2016-07-14 15:14                             ` Tejun Heo
2016-07-14 13:18               ` Peter Zijlstra
2016-07-14 14:14                 ` Peter Zijlstra
2016-07-14 14:58                 ` Oleg Nesterov
2016-07-14 16:14                   ` Peter Zijlstra
2016-07-14 16:37                   ` Peter Zijlstra
2016-07-14 17:05                     ` Oleg Nesterov
2016-07-14 16:23                 ` Paul E. McKenney
2016-07-14 16:45                   ` Peter Zijlstra
2016-07-14 17:15                     ` Paul E. McKenney
2016-07-14 16:43                 ` John Stultz
2016-07-14 16:49                   ` Peter Zijlstra
2016-07-14 17:02                     ` John Stultz
2016-07-14 17:13                       ` Oleg Nesterov
2016-07-14 17:30                         ` John Stultz
2016-07-14 17:41                           ` Oleg Nesterov
2016-07-14 17:51                             ` John Stultz
2016-07-14 18:09                 ` Oleg Nesterov
2016-07-14 18:36                   ` Peter Zijlstra
2016-07-14 19:35                     ` Peter Zijlstra [this message]
2016-07-13 20:57             ` John Stultz
2016-07-13 20:52           ` Paul E. McKenney
2016-07-13 20:57             ` Peter Zijlstra
2016-07-13 21:08               ` Paul E. McKenney
2016-07-13 21:01             ` Dmitry Shmidt
2016-07-13 21:03               ` John Stultz
2016-07-13 21:05               ` Paul E. McKenney
2016-07-13 20:31     ` Dmitry Shmidt
2016-07-13 20:44   ` Colin Cross
2016-07-13 20:54     ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160714193537.GQ30927@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=ccross@google.com \
    --cc=dimitrysh@google.com \
    --cc=john.stultz@linaro.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=oleg@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=romlem@google.com \
    --cc=tj@kernel.org \
    --cc=tkjos@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox