From: Waiman Long <waiman.long@hp.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Low <jason.low2@hp.com>,
	mingo@redhat.com, paulmck@linux.vnet.ibm.com,
	torvalds@linux-foundation.org, tglx@linutronix.de,
	linux-kernel@vger.kernel.org, riel@redhat.com,
	akpm@linux-foundation.org, davidlohr@hp.com, hpa@zytor.com,
	andi@firstfloor.org, aswin@hp.com, scott.norton@hp.com,
	chegu_vinod@hp.com
Subject: Re: [RFC][PATCH v2 5/5] mutex: Give spinners a chance to spin_on_owner if need_resched() triggered while queued
Date: Wed, 05 Feb 2014 16:44:34 -0500	[thread overview]
Message-ID: <52F2B0C2.8040408@hp.com> (raw)
In-Reply-To: <20140129115142.GE9636@twins.programming.kicks-ass.net>

On 01/29/2014 06:51 AM, Peter Zijlstra wrote:
> On Tue, Jan 28, 2014 at 02:51:35PM -0800, Jason Low wrote:
>>> But urgh, nasty problem. Lemme ponder this a bit.
> OK, please have a very careful look at the below. It survived a boot
> with udev -- which usually stresses mutex contention enough to explode
> (in fact it did a few times when I got the contention/cancel path wrong),
> however I have not run anything else on it.
>
> The below is an MCS variant that allows relatively cheap unqueueing. But
> it's somewhat tricky and I might have gotten a case wrong, esp. the
> double concurrent cancel case got my head hurting (I didn't attempt a
> triple unqueue).
>
> Applies to tip/master but does generate a few (harmless) compile
> warnings because I didn't fully clean up the mcs_spinlock vs m_spinlock
> thing.
>
> Also, there's a comment in the slowpath that bears consideration.
>
>

I have an alternative way of breaking out of the MCS lock waiting queue
when need_resched() is set: I overload the locked flag so that a negative
value marks the node as skipped. I ran the patch through the AIM7
high-systime workload on a 4-socket server and it seemed to run fine.
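
To make the handshake concrete, here is a standalone sketch of the
protocol on the locked flag (illustrative only; the helper names
skip_wait() and pass_lock() are made up for this example and are not
part of the patch below):

/*
 * node->locked is 0 while queued, 1 once the lock has been handed to
 * this node, and -1 if the waiter bailed out because need_resched()
 * was set.  Both transitions out of 0 are done with cmpxchg(), so the
 * waiter and the lock holder cannot both win.
 */
static inline bool skip_wait(struct mcs_spinlock *node)
{
    /* Waiter side: try to mark our own node as skipped. */
    return cmpxchg(&node->locked, 0, -1) == 0;
}

static inline bool pass_lock(struct mcs_spinlock *next)
{
    /* Unlocker side: try to hand the lock to the next waiter. */
    return cmpxchg(&next->locked, 0, 1) == 0;
}

If pass_lock() fails, the next waiter has already marked its node -1 and
the unlocker has to move on to the node after it; if skip_wait() fails,
the lock has already been passed down and the waiter must take it.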

Please take a look at the POC patch below and let me know if you have
any comments.

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index e9a4d74..84a0b32 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -14,17 +14,45 @@

  struct mcs_spinlock {
      struct mcs_spinlock *next;
-    int locked; /* 1 if lock acquired */
+    int locked; /* 1 if lock acquired, -1 if skipping node */
  };

+/*
+ * The node skipping feature requires the MCS node to be persistent,
+ * i.e. not allocated on stack. In addition, the MCS node cannot be
+ * reused until after the locked flag is cleared. The mcs_skip_node()
+ * macro must be defined before including this header file.
+ */
+#ifdef mcs_skip_node
+#undef arch_mcs_spin_lock_contended
+#undef arch_mcs_spin_unlock_contended
+
+#define    arch_mcs_spin_lock_contended(n)                    \
+do {                                    \
+    while (!smp_load_acquire(&(n)->locked)) {            \
+        if (mcs_skip_node(n)) {                    \
+            if (cmpxchg(&(n)->locked, 0, -1) == 0)        \
+                return;                    \
+        }                            \
+        arch_mutex_cpu_relax();                    \
+    };                                \
+} while (0)
+
+#define    arch_mcs_spin_unlock_contended(n)                \
+    if (cmpxchg(&(n)->locked, 0, 1) == 0)                \
+        return
+
+#define mcs_set_locked(n, v)    (n)->locked = (v)
+#else    /* mcs_skip_node */
+
  #ifndef arch_mcs_spin_lock_contended
  /*
   * Using smp_load_acquire() provides a memory barrier that ensures
   * subsequent operations happen after the lock is acquired.
   */
-#define arch_mcs_spin_lock_contended(l)                    \
+#define arch_mcs_spin_lock_contended(n)                    \
  do {                                    \
-    while (!(smp_load_acquire(l)))                    \
+    while (!(smp_load_acquire(&(n)->locked)))            \
          arch_mutex_cpu_relax();                    \
  } while (0)
  #endif
@@ -35,9 +63,12 @@ do {                                \
   * operations in the critical section has been completed before
   * unlocking.
   */
-#define arch_mcs_spin_unlock_contended(l)                \
-    smp_store_release((l), 1)
+#define arch_mcs_spin_unlock_contended(n)                \
+    smp_store_release(&(n)->locked, 1)
+
+#define mcs_set_locked(n, v)
  #endif
+#endif    /* mcs_skip_node */

  /*
   * Note: the smp_load_acquire/smp_store_release pair is not
@@ -77,12 +108,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
           * value won't be used. If a debug mode is needed to
           * audit lock status, then set node->locked value here.
           */
+        mcs_set_locked(node, 1);
          return;
      }
      ACCESS_ONCE(prev->next) = node;

      /* Wait until the lock holder passes the lock down. */
-    arch_mcs_spin_lock_contended(&node->locked);
+    arch_mcs_spin_lock_contended(node);
  }

  /*
@@ -94,19 +126,35 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
  {
      struct mcs_spinlock *next = ACCESS_ONCE(node->next);

+#ifdef mcs_skip_node
+check_next_node:
+#endif
      if (likely(!next)) {
          /*
           * Release the lock by setting it to NULL
           */
-        if (likely(cmpxchg(lock, node, NULL) == node))
+        if (likely(cmpxchg(lock, node, NULL) == node)) {
+            mcs_set_locked(node, 0);
              return;
+        }
          /* Wait until the next pointer is set */
          while (!(next = ACCESS_ONCE(node->next)))
              arch_mutex_cpu_relax();
      }
+    /* Clear the lock flag to indicate the node can be used again. */
+    mcs_set_locked(node, 0);

      /* Pass lock to next waiter. */
-    arch_mcs_spin_unlock_contended(&next->locked);
+    arch_mcs_spin_unlock_contended(next);
+
+#ifdef mcs_skip_node
+    /*
+     * The next node marked itself as skipped; move on to the one after it.
+     */
+    node = next;
+    next = ACCESS_ONCE(node->next);
+    goto check_next_node;
+#endif
  }

  #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 45fe1b5..6351eca 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,10 @@
  #include <linux/spinlock.h>
  #include <linux/interrupt.h>
  #include <linux/debug_locks.h>
+/*
+ * Allow an MCS node to be skipped if need_resched() is true.
+ */
+#define mcs_skip_node(n)    need_resched()
  #include <linux/mcs_spinlock.h>

  /*
@@ -45,6 +49,13 @@
   */
  #define    MUTEX_SHOW_NO_WAITER(mutex)    (atomic_read(&(mutex)->count) >= 0)

+/*
+ * Using a single mcs node per CPU is safe because mutex_lock() should not be
+ * called from interrupt context and we have preemption disabled over the mcs
+ * lock usage.
+ */
+static DEFINE_PER_CPU(struct mcs_spinlock, m_node);
+
  void
  __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
  {
@@ -166,6 +177,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
      struct task_struct *owner;
      int retval = 1;

+    if (need_resched())
+        return 0;
+
      rcu_read_lock();
      owner = ACCESS_ONCE(lock->owner);
      if (owner)
@@ -370,6 +384,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
      struct mutex_waiter waiter;
      unsigned long flags;
      int ret;
+    struct mcs_spinlock  *node;

      preempt_disable();
      mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -400,9 +415,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
      if (!mutex_can_spin_on_owner(lock))
          goto slowpath;

+    node = this_cpu_ptr(&m_node);
+    if (node->locked < 0)
+        /* MCS node not available yet */
+        goto slowpath;
+
      for (;;) {
          struct task_struct *owner;
-        struct mcs_spinlock  node;

          if (use_ww_ctx && ww_ctx->acquired > 0) {
              struct ww_mutex *ww;
@@ -424,10 +443,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
           * If there's an owner, wait for it to either
           * release the lock or go to sleep.
           */
-        mcs_spin_lock(&lock->mcs_lock, &node);
+        mcs_spin_lock(&lock->mcs_lock, node);
+        if (node->locked < 0)
+            /*
+             * need_resched() true, no unlock needed
+             */
+            goto slowpath;
          owner = ACCESS_ONCE(lock->owner);
          if (owner && !mutex_spin_on_owner(lock, owner)) {
-            mcs_spin_unlock(&lock->mcs_lock, &node);
+            mcs_spin_unlock(&lock->mcs_lock, node);
              goto slowpath;
          }

@@ -442,11 +466,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
              }

              mutex_set_owner(lock);
-            mcs_spin_unlock(&lock->mcs_lock, &node);
+            mcs_spin_unlock(&lock->mcs_lock, node);
              preempt_enable();
              return 0;
          }
-        mcs_spin_unlock(&lock->mcs_lock, &node);
+        mcs_spin_unlock(&lock->mcs_lock, node);

          /*
           * When there's no owner, we might have preempted between the
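
On the unlock side, a failed hand-off means the next node was abandoned,
so the unlocker keeps walking the queue until it either passes the lock
to a live waiter or finds the queue empty.  The goto loop in the patch
is equivalent to the following simplified sketch (the same logic written
as a for loop, not the patched function itself):

void mcs_spin_unlock_skip(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
    for (;;) {
        struct mcs_spinlock *next = ACCESS_ONCE(node->next);

        if (!next) {
            /* Queue looks empty: try to release the lock. */
            if (cmpxchg(lock, node, NULL) == node) {
                node->locked = 0;    /* node reusable again */
                return;
            }
            /* Someone queued behind us; wait for the link. */
            while (!(next = ACCESS_ONCE(node->next)))
                arch_mutex_cpu_relax();
        }
        node->locked = 0;    /* node reusable again */

        /* Hand-off succeeded, we are done. */
        if (cmpxchg(&next->locked, 0, 1) == 0)
            return;

        /* next bailed out (locked == -1), skip to the node after it. */
        node = next;
    }
}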





Thread overview: 51+ messages
2014-01-28 19:13 [PATCH v2 0/5] mutex: Mutex scalability patches Jason Low
2014-01-28 19:13 ` [PATCH v2 1/5] mutex: In mutex_can_spin_on_owner(), return false if task need_resched() Jason Low
2014-01-28 20:20   ` Paul E. McKenney
2014-01-28 22:01     ` Jason Low
2014-01-28 21:09   ` Davidlohr Bueso
2014-03-11 12:41   ` [tip:core/locking] locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner() tip-bot for Jason Low
2014-01-28 19:13 ` [PATCH v2 2/5] mutex: Modify the way optimistic spinners are queued Jason Low
2014-01-28 20:23   ` Paul E. McKenney
2014-01-28 20:24     ` Paul E. McKenney
2014-01-28 21:17     ` Davidlohr Bueso
2014-01-28 22:10     ` Jason Low
2014-02-02 21:58       ` Paul E. McKenney
2014-03-11 12:41   ` [tip:core/locking] locking/mutexes: " tip-bot for Jason Low
2014-03-11 15:24     ` Jason Low
2014-03-11 15:33       ` Peter Zijlstra
2014-01-28 19:13 ` [PATCH v2 3/5] mutex: Unlock the mutex without the wait_lock Jason Low
2014-03-11 12:41   ` [tip:core/locking] locking/mutexes: " tip-bot for Jason Low
2014-03-12 12:24     ` Peter Zijlstra
2014-03-12 18:44       ` Jason Low
2014-03-13  7:28       ` [tip:core/locking] locking/mutex: Fix debug checks tip-bot for Peter Zijlstra
2014-01-28 19:13 ` [RFC][PATCH v2 4/5] mutex: Disable preemtion between modifying lock->owner and locking/unlocking mutex Jason Low
2014-01-28 20:54   ` Peter Zijlstra
2014-01-28 22:17     ` Jason Low
2014-01-28 19:13 ` [RFC][PATCH v2 5/5] mutex: Give spinners a chance to spin_on_owner if need_resched() triggered while queued Jason Low
2014-01-28 21:07   ` Peter Zijlstra
2014-01-28 22:51     ` Jason Low
2014-01-29 11:51       ` Peter Zijlstra
2014-01-31  3:29         ` Jason Low
2014-01-31 14:09           ` Peter Zijlstra
2014-01-31 20:01             ` Jason Low
2014-01-31 20:08               ` Peter Zijlstra
2014-02-02 21:01                 ` Jason Low
2014-02-02 21:12                   ` Peter Zijlstra
2014-02-03 18:39                     ` Jason Low
2014-02-03 19:25                       ` Peter Zijlstra
2014-02-03 20:55                         ` Jason Low
2014-02-03 21:06                           ` Peter Zijlstra
2014-02-03 21:56                             ` Jason Low
2014-02-04  7:13                         ` Jason Low
2014-02-02 22:02                 ` Paul E. McKenney
2014-02-02 20:02             ` Peter Zijlstra
2014-02-05 21:44         ` Waiman Long [this message]
2014-02-06 14:04           ` Peter Zijlstra
2014-02-06 18:45             ` Waiman Long
2014-02-06 20:10               ` Norton, Scott J
2014-02-10 17:01                 ` Peter Zijlstra
2014-02-06 17:44           ` Jason Low
2014-02-06 18:37             ` Waiman Long
2014-01-28 21:08 ` [PATCH v2 0/5] mutex: Mutex scalability patches Davidlohr Bueso
2014-01-28 23:11   ` Jason Low
  -- strict thread matches above, loose matches on Subject: below --
2014-02-06 14:52 [RFC][PATCH v2 5/5] mutex: Give spinners a chance to spin_on_owner if need_resched() triggered while queued Daniel J Blueman
