[PATCH 06/13] locking/qspinlock: merge qspinlock_paravirt.h into qspinlock.c

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Nicholas Piggin <npiggin@gmail.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>,
	Ingo Molnar <mingo@redhat.com>, Will Deacon <will@kernel.org>,
	Waiman Long <longman@redhat.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	linux-kernel@vger.kernel.org
Subject: [PATCH 06/13] locking/qspinlock: merge qspinlock_paravirt.h into qspinlock.c
Date: Tue,  5 Jul 2022 00:38:13 +1000	[thread overview]
Message-ID: <20220704143820.3071004-7-npiggin@gmail.com> (raw)
In-Reply-To: <20220704143820.3071004-1-npiggin@gmail.com>

There isn't much reason to keep these separate.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 kernel/locking/qspinlock.c          | 488 ++++++++++++++++++++++++++-
 kernel/locking/qspinlock_paravirt.h | 490 ----------------------------
 2 files changed, 487 insertions(+), 491 deletions(-)
 delete mode 100644 kernel/locking/qspinlock_paravirt.h

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b96c58ca51de..9a235b0d98ca 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -16,6 +16,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/memblock.h>
 #include <linux/mutex.h>
 #include <linux/prefetch.h>
 #include <asm/byteorder.h>
@@ -284,7 +285,492 @@ static __always_inline void set_locked(struct qspinlock *lock)
 }
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-#include "qspinlock_paravirt.h"
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
+
+/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK	0xff
+
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
+enum vcpu_state {
+	vcpu_running = 0,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
+};
+
+/*
+ * Hybrid PV queued/unfair lock
+ *
+ * This function is called once when a lock waiter enters the PV slowpath
+ * before being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
+ */
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
+{
+	/*
+	 * Stay in unfair lock mode as long as queued mode waiters are
+	 * present in the MCS wait queue but the pending bit isn't set.
+	 */
+	for (;;) {
+		int val = atomic_read(&lock->val);
+
+		if (!(val & _Q_LOCKED_PENDING_MASK) &&
+		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
+			lockevent_inc(pv_lock_stealing);
+			return true;
+		}
+		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+			break;
+
+		cpu_relax();
+	}
+
+	return false;
+}
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+	struct qspinlock *lock;
+	struct qnode   *node;
+};
+
+#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+	if (pv_hash_size < PV_HE_MIN)
+		pv_hash_size = PV_HE_MIN;
+
+	/*
+	 * Allocate space from bootmem which should be page-size aligned
+	 * and hence cacheline aligned.
+	 */
+	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+					       sizeof(struct pv_hash_entry),
+					       pv_hash_size, 0,
+					       HASH_EARLY | HASH_ZERO,
+					       &pv_lock_hash_bits, NULL,
+					       pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash)						\
+	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
+	     offset < (1 << pv_lock_hash_bits);						\
+	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct qnode *node)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+	int hopcnt = 0;
+
+	for_each_hash_entry(he, offset, hash) {
+		hopcnt++;
+		if (!cmpxchg(&he->lock, NULL, lock)) {
+			WRITE_ONCE(he->node, node);
+			lockevent_pv_hop(hopcnt);
+			return &he->lock;
+		}
+	}
+	/*
+	 * Hard assume there is a free entry for us.
+	 *
+	 * This is guaranteed by ensuring every blocked lock only ever consumes
+	 * a single entry, and since we only have 4 nesting levels per CPU
+	 * and allocated 4*nr_possible_cpus(), this must be so.
+	 *
+	 * The single entry is guaranteed by having the lock owner unhash
+	 * before it releases.
+	 */
+	BUG();
+}
+
+static struct qnode *pv_unhash(struct qspinlock *lock)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+	struct qnode *node;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (READ_ONCE(he->lock) == lock) {
+			node = READ_ONCE(he->node);
+			WRITE_ONCE(he->lock, NULL);
+			return node;
+		}
+	}
+	/*
+	 * Hard assume we'll find an entry.
+	 *
+	 * This guarantees a limited lookup time and is itself guaranteed by
+	 * having the lock owner do the unhash -- IFF the unlock sees the
+	 * SLOW flag, there MUST be a hash entry.
+	 */
+	BUG();
+}
+
+/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct qnode *prev, int loop)
+{
+	if ((loop & PV_PREV_CHECK_MASK) != 0)
+		return false;
+
+	return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
+ * Initialize the PV part of the qnode.
+ */
+static void pv_init_node(struct qnode *node)
+{
+	node->cpu = smp_processor_id();
+	node->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
+ */
+static void pv_wait_node(struct qnode *node, struct qnode *prev)
+{
+	int loop;
+	bool wait_early;
+
+	for (;;) {
+		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
+			if (READ_ONCE(node->locked))
+				return;
+			if (pv_wait_early(prev, loop)) {
+				wait_early = true;
+				break;
+			}
+			cpu_relax();
+		}
+
+		/*
+		 * Order node->state vs node->locked thusly:
+		 *
+		 * [S] node->state = vcpu_halted  [S] next->locked = 1
+		 *     MB                             MB
+		 * [L] node->locked             [RmW] node->state = vcpu_hashed
+		 *
+		 * Matches the cmpxchg() from pv_kick_node().
+		 */
+		smp_store_mb(node->state, vcpu_halted);
+
+		if (!READ_ONCE(node->locked)) {
+			lockevent_inc(pv_wait_node);
+			lockevent_cond_inc(pv_wait_early, wait_early);
+			pv_wait(&node->state, vcpu_halted);
+		}
+
+		/*
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head_or_lock() knows to not also try
+		 * to hash this lock.
+		 */
+		cmpxchg(&node->state, vcpu_halted, vcpu_running);
+
+		/*
+		 * If the locked flag is still not set after wakeup, it is a
+		 * spurious wakeup and the vCPU should wait again. However,
+		 * there is a pretty high overhead for CPU halting and kicking.
+		 * So it is better to spin for a while in the hope that the
+		 * MCS lock will be released soon.
+		 */
+		lockevent_cond_inc(pv_spurious_wakeup,
+				  !READ_ONCE(node->locked));
+	}
+
+	/*
+	 * By now our node->locked should be 1 and our caller will not actually
+	 * spin-wait for it. We do however rely on our caller to do a
+	 * load-acquire for us.
+	 */
+}
+
+/*
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
+ */
+static void pv_kick_node(struct qspinlock *lock, struct qnode *node)
+{
+	/*
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
+	 *
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 *
+	 * The write to next->locked in arch_mcs_spin_unlock_contended()
+	 * must be ordered before the read of node->state in the cmpxchg()
+	 * below for the code to work correctly. To guarantee full ordering
+	 * irrespective of the success or failure of the cmpxchg(),
+	 * a relaxed version with explicit barrier is used. The control
+	 * dependency will order the reading of node->state before any
+	 * subsequent writes.
+	 */
+	smp_mb__before_atomic();
+	if (cmpxchg_relaxed(&node->state, vcpu_halted, vcpu_hashed)
+	    != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
+	 *
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
+	 */
+	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, node);
+}
+
+/*
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
+ */
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
+{
+	struct qspinlock **lp = NULL;
+	int waitcnt = 0;
+	int loop;
+
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(node->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
+	/*
+	 * Tracking # of slowpath locking operations
+	 */
+	lockevent_inc(lock_slowpath);
+
+	for (;; waitcnt++) {
+		/*
+		 * Set correct vCPU state to be used by queue node wait-early
+		 * mechanism.
+		 */
+		WRITE_ONCE(node->state, vcpu_running);
+
+		/*
+		 * Set the pending bit in the active lock spinning loop to
+		 * disable lock stealing before attempting to acquire the lock.
+		 */
+		set_pending(lock);
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (trylock_clear_pending(lock))
+				goto gotlock;
+			cpu_relax();
+		}
+		clear_pending(lock);
+
+
+		if (!lp) { /* ONCE */
+			lp = pv_hash(lock, node);
+
+			/*
+			 * We must hash before setting _Q_SLOW_VAL, such that
+			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+			 * we'll be sure to be able to observe our hash entry.
+			 *
+			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
+			 *       MB                           RMB
+			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
+			 *
+			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
+			 */
+			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
+				/*
+				 * The lock was free and now we own the lock.
+				 * Change the lock value back to _Q_LOCKED_VAL
+				 * and unhash the table.
+				 */
+				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+				WRITE_ONCE(*lp, NULL);
+				goto gotlock;
+			}
+		}
+		WRITE_ONCE(node->state, vcpu_hashed);
+		lockevent_inc(pv_wait_head);
+		lockevent_cond_inc(pv_wait_again, waitcnt);
+		pv_wait(&lock->locked, _Q_SLOW_VAL);
+
+		/*
+		 * Because of lock stealing, the queue head vCPU may not be
+		 * able to acquire the lock before it has to wait again.
+		 */
+	}
+
+	/*
+	 * The cmpxchg() or xchg() call before coming here provides the
+	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * here is to indicate to the compiler that the value will always
+	 * be nozero to enable better code optimization.
+	 */
+gotlock:
+	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
+}
+
+/*
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
+ */
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
+{
+	struct qnode *node;
+
+	if (unlikely(locked != _Q_SLOW_VAL)) {
+		WARN(!debug_locks_silent,
+		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+		     (unsigned long)lock, atomic_read(&lock->val));
+		return;
+	}
+
+	/*
+	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+	 * so we need a barrier to order the read of the node data in
+	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 *
+	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+	 */
+	smp_rmb();
+
+	/*
+	 * Since the above failed to release, this must be the SLOW path.
+	 * Therefore start by looking up the blocked node and unhashing it.
+	 */
+	node = pv_unhash(lock);
+
+	/*
+	 * Now that we have a reference to the (likely) blocked qnode,
+	 * release the lock.
+	 */
+	smp_store_release(&lock->locked, 0);
+
+	/*
+	 * At this point the memory pointed at by lock can be freed/reused,
+	 * however we can still use the qnode to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
+	 */
+	lockevent_inc(pv_kick_unlock);
+	pv_kick(node->cpu);
+}
+
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	u8 locked;
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
+	if (likely(locked == _Q_LOCKED_VAL))
+		return;
+
+	__pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif
+
 #else /* CONFIG_PARAVIRT_SPINLOCKS */
 static __always_inline void pv_init_node(struct qnode *node) { }
 static __always_inline void pv_wait_node(struct qnode *node,
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
deleted file mode 100644
index f1922e3a0f7d..000000000000
--- a/kernel/locking/qspinlock_paravirt.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/hash.h>
-#include <linux/memblock.h>
-#include <linux/debug_locks.h>
-
-/*
- * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
- * of spinning them.
- *
- * This relies on the architecture to provide two paravirt hypercalls:
- *
- *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
- *   pv_kick(cpu)             -- wakes a suspended vcpu
- *
- * Using these we implement __pv_queued_spin_lock_slowpath() and
- * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
- * native_queued_spin_unlock().
- */
-
-#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
-
-/*
- * Queue Node Adaptive Spinning
- *
- * A queue node vCPU will stop spinning if the vCPU in the previous node is
- * not running. The one lock stealing attempt allowed at slowpath entry
- * mitigates the slight slowdown for non-overcommitted guest with this
- * aggressive wait-early mechanism.
- *
- * The status of the previous node will be checked at fixed interval
- * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
- * pound on the cacheline of the previous node too heavily.
- */
-#define PV_PREV_CHECK_MASK	0xff
-
-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
-enum vcpu_state {
-	vcpu_running = 0,
-	vcpu_halted,		/* Used only in pv_wait_node */
-	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
-};
-
-/*
- * Hybrid PV queued/unfair lock
- *
- * This function is called once when a lock waiter enters the PV slowpath
- * before being queued.
- *
- * The pending bit is set by the queue head vCPU of the MCS wait queue in
- * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
- * When that bit becomes visible to the incoming waiters, no lock stealing
- * is allowed. The function will return immediately to make the waiters
- * enter the MCS wait queue. So lock starvation shouldn't happen as long
- * as the queued mode vCPUs are actively running to set the pending bit
- * and hence disabling lock stealing.
- *
- * When the pending bit isn't set, the lock waiters will stay in the unfair
- * mode spinning on the lock unless the MCS wait queue is empty. In this
- * case, the lock waiters will enter the queued mode slowpath trying to
- * become the queue head and set the pending bit.
- *
- * This hybrid PV queued/unfair lock combines the best attributes of a
- * queued lock (no lock starvation) and an unfair lock (good performance
- * on not heavily contended locks).
- */
-static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
-{
-	/*
-	 * Stay in unfair lock mode as long as queued mode waiters are
-	 * present in the MCS wait queue but the pending bit isn't set.
-	 */
-	for (;;) {
-		int val = atomic_read(&lock->val);
-
-		if (!(val & _Q_LOCKED_PENDING_MASK) &&
-		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
-			lockevent_inc(pv_lock_stealing);
-			return true;
-		}
-		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
-			break;
-
-		cpu_relax();
-	}
-
-	return false;
-}
-
-/*
- * Lock and MCS node addresses hash table for fast lookup
- *
- * Hashing is done on a per-cacheline basis to minimize the need to access
- * more than one cacheline.
- *
- * Dynamically allocate a hash table big enough to hold at least 4X the
- * number of possible cpus in the system. Allocation is done on page
- * granularity. So the minimum number of hash buckets should be at least
- * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
- *
- * Since we should not be holding locks from NMI context (very rare indeed) the
- * max load factor is 0.75, which is around the point where open addressing
- * breaks down.
- *
- */
-struct pv_hash_entry {
-	struct qspinlock *lock;
-	struct qnode   *node;
-};
-
-#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
-#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
-
-static struct pv_hash_entry *pv_lock_hash;
-static unsigned int pv_lock_hash_bits __read_mostly;
-
-/*
- * Allocate memory for the PV qspinlock hash buckets
- *
- * This function should be called from the paravirt spinlock initialization
- * routine.
- */
-void __init __pv_init_lock_hash(void)
-{
-	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
-
-	if (pv_hash_size < PV_HE_MIN)
-		pv_hash_size = PV_HE_MIN;
-
-	/*
-	 * Allocate space from bootmem which should be page-size aligned
-	 * and hence cacheline aligned.
-	 */
-	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
-					       sizeof(struct pv_hash_entry),
-					       pv_hash_size, 0,
-					       HASH_EARLY | HASH_ZERO,
-					       &pv_lock_hash_bits, NULL,
-					       pv_hash_size, pv_hash_size);
-}
-
-#define for_each_hash_entry(he, offset, hash)						\
-	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
-	     offset < (1 << pv_lock_hash_bits);						\
-	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
-
-static struct qspinlock **pv_hash(struct qspinlock *lock, struct qnode *node)
-{
-	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
-	struct pv_hash_entry *he;
-	int hopcnt = 0;
-
-	for_each_hash_entry(he, offset, hash) {
-		hopcnt++;
-		if (!cmpxchg(&he->lock, NULL, lock)) {
-			WRITE_ONCE(he->node, node);
-			lockevent_pv_hop(hopcnt);
-			return &he->lock;
-		}
-	}
-	/*
-	 * Hard assume there is a free entry for us.
-	 *
-	 * This is guaranteed by ensuring every blocked lock only ever consumes
-	 * a single entry, and since we only have 4 nesting levels per CPU
-	 * and allocated 4*nr_possible_cpus(), this must be so.
-	 *
-	 * The single entry is guaranteed by having the lock owner unhash
-	 * before it releases.
-	 */
-	BUG();
-}
-
-static struct qnode *pv_unhash(struct qspinlock *lock)
-{
-	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
-	struct pv_hash_entry *he;
-	struct qnode *node;
-
-	for_each_hash_entry(he, offset, hash) {
-		if (READ_ONCE(he->lock) == lock) {
-			node = READ_ONCE(he->node);
-			WRITE_ONCE(he->lock, NULL);
-			return node;
-		}
-	}
-	/*
-	 * Hard assume we'll find an entry.
-	 *
-	 * This guarantees a limited lookup time and is itself guaranteed by
-	 * having the lock owner do the unhash -- IFF the unlock sees the
-	 * SLOW flag, there MUST be a hash entry.
-	 */
-	BUG();
-}
-
-/*
- * Return true if when it is time to check the previous node which is not
- * in a running state.
- */
-static inline bool
-pv_wait_early(struct qnode *prev, int loop)
-{
-	if ((loop & PV_PREV_CHECK_MASK) != 0)
-		return false;
-
-	return READ_ONCE(prev->state) != vcpu_running;
-}
-
-/*
- * Initialize the PV part of the qnode.
- */
-static void pv_init_node(struct qnode *node)
-{
-	node->cpu = smp_processor_id();
-	node->state = vcpu_running;
-}
-
-/*
- * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
- * behalf.
- */
-static void pv_wait_node(struct qnode *node, struct qnode *prev)
-{
-	int loop;
-	bool wait_early;
-
-	for (;;) {
-		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
-			if (READ_ONCE(node->locked))
-				return;
-			if (pv_wait_early(prev, loop)) {
-				wait_early = true;
-				break;
-			}
-			cpu_relax();
-		}
-
-		/*
-		 * Order node->state vs node->locked thusly:
-		 *
-		 * [S] node->state = vcpu_halted  [S] next->locked = 1
-		 *     MB                             MB
-		 * [L] node->locked             [RmW] node->state = vcpu_hashed
-		 *
-		 * Matches the cmpxchg() from pv_kick_node().
-		 */
-		smp_store_mb(node->state, vcpu_halted);
-
-		if (!READ_ONCE(node->locked)) {
-			lockevent_inc(pv_wait_node);
-			lockevent_cond_inc(pv_wait_early, wait_early);
-			pv_wait(&node->state, vcpu_halted);
-		}
-
-		/*
-		 * If pv_kick_node() changed us to vcpu_hashed, retain that
-		 * value so that pv_wait_head_or_lock() knows to not also try
-		 * to hash this lock.
-		 */
-		cmpxchg(&node->state, vcpu_halted, vcpu_running);
-
-		/*
-		 * If the locked flag is still not set after wakeup, it is a
-		 * spurious wakeup and the vCPU should wait again. However,
-		 * there is a pretty high overhead for CPU halting and kicking.
-		 * So it is better to spin for a while in the hope that the
-		 * MCS lock will be released soon.
-		 */
-		lockevent_cond_inc(pv_spurious_wakeup,
-				  !READ_ONCE(node->locked));
-	}
-
-	/*
-	 * By now our node->locked should be 1 and our caller will not actually
-	 * spin-wait for it. We do however rely on our caller to do a
-	 * load-acquire for us.
-	 */
-}
-
-/*
- * Called after setting next->locked = 1 when we're the lock owner.
- *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state
- * such that they're waiting in pv_wait_head_or_lock(), this avoids a
- * wake/sleep cycle.
- */
-static void pv_kick_node(struct qspinlock *lock, struct qnode *node)
-{
-	/*
-	 * If the vCPU is indeed halted, advance its state to match that of
-	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
-	 * observe its next->locked value and advance itself.
-	 *
-	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
-	 *
-	 * The write to next->locked in arch_mcs_spin_unlock_contended()
-	 * must be ordered before the read of node->state in the cmpxchg()
-	 * below for the code to work correctly. To guarantee full ordering
-	 * irrespective of the success or failure of the cmpxchg(),
-	 * a relaxed version with explicit barrier is used. The control
-	 * dependency will order the reading of node->state before any
-	 * subsequent writes.
-	 */
-	smp_mb__before_atomic();
-	if (cmpxchg_relaxed(&node->state, vcpu_halted, vcpu_hashed)
-	    != vcpu_halted)
-		return;
-
-	/*
-	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
-	 *
-	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
-	 * the hash table later on at unlock time, no atomic instruction is
-	 * needed.
-	 */
-	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
-	(void)pv_hash(lock, node);
-}
-
-/*
- * Wait for l->locked to become clear and acquire the lock;
- * halt the vcpu after a short spin.
- * __pv_queued_spin_unlock() will wake us.
- *
- * The current value of the lock will be returned for additional processing.
- */
-static u32
-pv_wait_head_or_lock(struct qspinlock *lock, struct qnode *node)
-{
-	struct qspinlock **lp = NULL;
-	int waitcnt = 0;
-	int loop;
-
-	/*
-	 * If pv_kick_node() already advanced our state, we don't need to
-	 * insert ourselves into the hash table anymore.
-	 */
-	if (READ_ONCE(node->state) == vcpu_hashed)
-		lp = (struct qspinlock **)1;
-
-	/*
-	 * Tracking # of slowpath locking operations
-	 */
-	lockevent_inc(lock_slowpath);
-
-	for (;; waitcnt++) {
-		/*
-		 * Set correct vCPU state to be used by queue node wait-early
-		 * mechanism.
-		 */
-		WRITE_ONCE(node->state, vcpu_running);
-
-		/*
-		 * Set the pending bit in the active lock spinning loop to
-		 * disable lock stealing before attempting to acquire the lock.
-		 */
-		set_pending(lock);
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
-			if (trylock_clear_pending(lock))
-				goto gotlock;
-			cpu_relax();
-		}
-		clear_pending(lock);
-
-
-		if (!lp) { /* ONCE */
-			lp = pv_hash(lock, node);
-
-			/*
-			 * We must hash before setting _Q_SLOW_VAL, such that
-			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
-			 * we'll be sure to be able to observe our hash entry.
-			 *
-			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
-			 *       MB                           RMB
-			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
-			 *
-			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
-			 */
-			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
-				/*
-				 * The lock was free and now we own the lock.
-				 * Change the lock value back to _Q_LOCKED_VAL
-				 * and unhash the table.
-				 */
-				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
-				WRITE_ONCE(*lp, NULL);
-				goto gotlock;
-			}
-		}
-		WRITE_ONCE(node->state, vcpu_hashed);
-		lockevent_inc(pv_wait_head);
-		lockevent_cond_inc(pv_wait_again, waitcnt);
-		pv_wait(&lock->locked, _Q_SLOW_VAL);
-
-		/*
-		 * Because of lock stealing, the queue head vCPU may not be
-		 * able to acquire the lock before it has to wait again.
-		 */
-	}
-
-	/*
-	 * The cmpxchg() or xchg() call before coming here provides the
-	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
-	 * here is to indicate to the compiler that the value will always
-	 * be nozero to enable better code optimization.
-	 */
-gotlock:
-	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
-}
-
-/*
- * PV versions of the unlock fastpath and slowpath functions to be used
- * instead of queued_spin_unlock().
- */
-__visible void
-__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
-{
-	struct qnode *node;
-
-	if (unlikely(locked != _Q_SLOW_VAL)) {
-		WARN(!debug_locks_silent,
-		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
-		     (unsigned long)lock, atomic_read(&lock->val));
-		return;
-	}
-
-	/*
-	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
-	 * so we need a barrier to order the read of the node data in
-	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
-	 *
-	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
-	 */
-	smp_rmb();
-
-	/*
-	 * Since the above failed to release, this must be the SLOW path.
-	 * Therefore start by looking up the blocked node and unhashing it.
-	 */
-	node = pv_unhash(lock);
-
-	/*
-	 * Now that we have a reference to the (likely) blocked qnode,
-	 * release the lock.
-	 */
-	smp_store_release(&lock->locked, 0);
-
-	/*
-	 * At this point the memory pointed at by lock can be freed/reused,
-	 * however we can still use the qnode to kick the CPU.
-	 * The other vCPU may not really be halted, but kicking an active
-	 * vCPU is harmless other than the additional latency in completing
-	 * the unlock.
-	 */
-	lockevent_inc(pv_kick_unlock);
-	pv_kick(node->cpu);
-}
-
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
-#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
-{
-	u8 locked;
-
-	/*
-	 * We must not unlock if SLOW, because in that case we must first
-	 * unhash. Otherwise it would be possible to have multiple @lock
-	 * entries, which would be BAD.
-	 */
-	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
-	if (likely(locked == _Q_LOCKED_VAL))
-		return;
-
-	__pv_queued_spin_unlock_slowpath(lock, locked);
-}
-#endif /* __pv_queued_spin_unlock */
-- 
2.35.1

next prev parent reply	other threads:[~2022-07-04 14:39 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-04 14:38 [PATCH 00/13] locking/qspinlock: simplify code generation Nicholas Piggin
2022-07-04 14:38 ` [PATCH 01/13] locking/qspinlock: remove pv_node abstraction Nicholas Piggin
2022-07-06 23:23   ` Boqun Feng
2022-07-04 14:38 ` [PATCH 02/13] locking/qspinlock: inline mcs_spinlock functions into qspinlock Nicholas Piggin
2022-07-05 16:57   ` Peter Zijlstra
2022-07-12  0:06     ` Nicholas Piggin
2022-07-04 14:38 ` [PATCH 03/13] locking/qspinlock: split common mcs queueing code into its own function Nicholas Piggin
2022-07-05 17:01   ` Peter Zijlstra
2022-07-12  0:10     ` Nicholas Piggin
2022-07-04 14:38 ` [PATCH 04/13] locking/qspinlock: move pv lock word helpers into qspinlock.c Nicholas Piggin
2022-07-05 19:34   ` Waiman Long
2022-07-12  0:11     ` Nicholas Piggin
2022-07-04 14:38 ` [PATCH 05/13] locking/qspinlock: be less clever with the preprocessor Nicholas Piggin
2022-07-05 17:08   ` Peter Zijlstra
2022-07-12  0:29     ` Nicholas Piggin
2022-07-05 20:02   ` Waiman Long
2022-07-12  0:33     ` Nicholas Piggin
2022-07-04 14:38 ` Nicholas Piggin [this message]
2022-07-05 17:20   ` [PATCH 06/13] locking/qspinlock: merge qspinlock_paravirt.h into qspinlock.c Peter Zijlstra
2022-07-05 17:36     ` Peter Zijlstra
2022-07-12  0:46       ` Nicholas Piggin
2022-07-06 13:35     ` Waiman Long
2022-07-06 14:16       ` Peter Zijlstra
2022-07-04 14:38 ` [PATCH 07/13] locking/qspinlock: remove arch qspinlock_paravirt.h includes Nicholas Piggin
2022-07-04 14:38 ` [PATCH 08/13] locking/qspinlock: stop renaming queued_spin_lock_slowpath to native_queued_spin_lock_slowpath Nicholas Piggin
2022-07-05 17:28   ` Peter Zijlstra
2022-07-04 14:38 ` [PATCH 09/13] locking/qspinlock: rename __pv_init_lock_hash to pv_spinlocks_init Nicholas Piggin
2022-07-04 14:38 ` [PATCH 10/13] locking/qspinlock: paravirt use simple trylock in case idx overflows Nicholas Piggin
2022-07-04 14:38 ` [PATCH 11/13] locking/qspinlock: Use queued_spin_trylock in pv_hybrid_queued_unfair_trylock Nicholas Piggin
2022-07-05 17:31   ` Peter Zijlstra
2022-07-05 20:15   ` Waiman Long
2022-07-12  0:48     ` Nicholas Piggin
2022-07-04 14:38 ` [PATCH 12/13] locking/qspinlock: separate pv_wait_node from the non-paravirt path Nicholas Piggin
2022-07-05 17:34   ` Peter Zijlstra
2022-07-12  0:50     ` Nicholas Piggin
2022-07-04 14:38 ` [PATCH 13/13] locking/qspinlock: simplify pv_wait_head_or_lock calling scheme Nicholas Piggin
2022-07-05 17:59 ` [PATCH 00/13] locking/qspinlock: simplify code generation Peter Zijlstra
2022-07-12  0:56   ` Nicholas Piggin

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:b96c58ca51d dfblob:9a235b0d98c dfblob:f1922e3a0f7 )
 OR (
bs:"[PATCH 06/13] locking/qspinlock: merge qspinlock_paravirt.h into qspinlock.c" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220704143820.3071004-7-npiggin@gmail.com \
    --to=npiggin@gmail.com \
    --cc=boqun.feng@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.