From mboxrd@z Thu Jan 1 00:00:00 1970
From: catalin.marinas@arm.com (Catalin Marinas)
Date: Tue, 21 Jul 2015 17:53:39 +0100
Subject: [PATCH 07/18] arm64: locks: patch in lse instructions when supported by the CPU
In-Reply-To: <1436779519-2232-8-git-send-email-will.deacon@arm.com>
References: <1436779519-2232-1-git-send-email-will.deacon@arm.com>
 <1436779519-2232-8-git-send-email-will.deacon@arm.com>
Message-ID: <20150721165338.GE7250@e104818-lin.cambridge.arm.com>
To: linux-arm-kernel@lists.infradead.org
List-Id: linux-arm-kernel.lists.infradead.org

On Mon, Jul 13, 2015 at 10:25:08AM +0100, Will Deacon wrote:
> @@ -67,15 +78,25 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
>  unsigned int tmp;
>  arch_spinlock_t lockval;
> 
> - asm volatile(
> -" prfm pstl1strm, %2\n"
> -"1: ldaxr %w0, %2\n"
> -" eor %w1, %w0, %w0, ror #16\n"
> -" cbnz %w1, 2f\n"
> -" add %w0, %w0, %3\n"
> -" stxr %w1, %w0, %2\n"
> -" cbnz %w1, 1b\n"
> -"2:"
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> + " prfm pstl1strm, %2\n"
> + "1: ldaxr %w0, %2\n"
> + " eor %w1, %w0, %w0, ror #16\n"
> + " cbnz %w1, 2f\n"
> + " add %w0, %w0, %3\n"
> + " stxr %w1, %w0, %2\n"
> + " cbnz %w1, 1b\n"
> + "2:",
> + /* LSE atomics */
> + " ldar %w0, %2\n"

Do we still need an acquire if we fail to take the lock?

> + " eor %w1, %w0, %w0, ror #16\n"
> + " cbnz %w1, 1f\n"
> + " add %w1, %w0, %3\n"
> + " casa %w0, %w1, %2\n"
> + " and %w1, %w1, #0xffff\n"
> + " eor %w1, %w1, %w0, lsr #16\n"
> + "1:")
>  : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
>  : "I" (1 << TICKET_SHIFT)
>  : "memory");

I wonder if this is any faster with LSE. CAS would have to re-load the
lock but we already have the value loaded (though most likely the reload
will be from cache and we save a cbnz that can be mispredicted). I guess
we'll re-test when we get some real hardware. Does prfm help in any way
with the LDAR?

> @@ -85,10 +106,19 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
> 
>  static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> - asm volatile(
> -" stlrh %w1, %0\n"
> - : "=Q" (lock->owner)
> - : "r" (lock->owner + 1)
> + unsigned long tmp;
> +
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> + " ldr %w1, %0\n"
> + " add %w1, %w1, #1\n"
> + " stlrh %w1, %0",
> + /* LSE atomics */
> + " mov %w1, #1\n"
> + " nop\n"
> + " staddlh %w1, %0")
> + : "=Q" (lock->owner), "=&r" (tmp)
> + :
>  : "memory");
>  }
> 
> @@ -125,11 +155,19 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
> 
>  asm volatile(
>  " sevl\n"
> + ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
>  "1: wfe\n"
>  "2: ldaxr %w0, %1\n"
>  " cbnz %w0, 1b\n"
>  " stxr %w0, %w2, %1\n"
> - " cbnz %w0, 2b\n"
> + " cbnz %w0, 2b",
> + /* LSE atomics */
> + "1: wfe\n"
> + " mov %w0, wzr\n"
> + " casa %w0, %w2, %1\n"
> + " nop\n"
> + " cbnz %w0, 1b")
>  : "=&r" (tmp), "+Q" (rw->lock)
>  : "r" (0x80000000)
>  : "memory");

With WFE in the LL/SC case, we rely on LDAXR to set the exclusive monitor
and an event would be generated every time it gets cleared. With CAS, we
no longer have this behaviour, so what guarantees a SEV?

> @@ -139,11 +177,16 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
>  {
>  unsigned int tmp;
> 
> - asm volatile(
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
>  " ldaxr %w0, %1\n"
>  " cbnz %w0, 1f\n"
>  " stxr %w0, %w2, %1\n"

Not a comment on your patch but on the original code: don't we need a
branch back to the ldaxr if the stxr fails, like we do for
arch_spin_trylock? The same comment applies to arch_read_trylock.
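Just to illustrate what I mean (an untested sketch written here for the
discussion, not taken from your patch; the _retry name is made up), the
LL/SC version with a retry branch would look something like:

	static inline int arch_write_trylock_retry(arch_rwlock_t *rw)
	{
		unsigned int tmp;

		asm volatile(
		/* retry from the ldaxr if the exclusive store fails */
		"1:	ldaxr	%w0, %1\n"
		"	cbnz	%w0, 2f\n"	/* lock already held, give up */
		"	stxr	%w0, %w2, %1\n"
		"	cbnz	%w0, 1b\n"	/* stxr failed, try again */
		"2:"
		: "=&r" (tmp), "+Q" (rw->lock)
		: "r" (0x80000000)
		: "memory");

		return !tmp;
	}

That way a spurious stxr failure doesn't make the trylock report the
lock as taken when it was actually free.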
> - "1:\n" > + "1:", > + /* LSE atomics */ > + " mov %w0, wzr\n" > + " casa %w0, %w2, %1\n" > + " nop") > : "=&r" (tmp), "+Q" (rw->lock) > : "r" (0x80000000) > : "memory"); > @@ -153,9 +196,10 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) > > static inline void arch_write_unlock(arch_rwlock_t *rw) > { > - asm volatile( > - " stlr %w1, %0\n" > - : "=Q" (rw->lock) : "r" (0) : "memory"); > + asm volatile(ARM64_LSE_ATOMIC_INSN( > + " stlr wzr, %0", > + " swpl wzr, wzr, %0") > + : "=Q" (rw->lock) :: "memory"); Is this any better than just STLR? We don't need the memory read. > @@ -172,6 +216,10 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) > * > * The memory barriers are implicit with the load-acquire and store-release > * instructions. > + * > + * Note that in UNDEFINED cases, such as unlocking a lock twice, the LL/SC > + * and LSE implementations may exhibit different behaviour (although this > + * will have no effect on lockdep). > */ > static inline void arch_read_lock(arch_rwlock_t *rw) > { > @@ -179,26 +227,43 @@ static inline void arch_read_lock(arch_rwlock_t *rw) > > asm volatile( > " sevl\n" > + ARM64_LSE_ATOMIC_INSN( > + /* LL/SC */ > "1: wfe\n" > "2: ldaxr %w0, %2\n" > " add %w0, %w0, #1\n" > " tbnz %w0, #31, 1b\n" > " stxr %w1, %w0, %2\n" > - " cbnz %w1, 2b\n" > + " nop\n" > + " cbnz %w1, 2b", > + /* LSE atomics */ > + "1: wfe\n" > + "2: ldr %w0, %2\n" > + " adds %w1, %w0, #1\n" > + " tbnz %w1, #31, 1b\n" > + " casa %w0, %w1, %2\n" > + " sbc %w0, %w1, %w0\n" > + " cbnz %w0, 2b") > : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) > : > - : "memory"); > + : "cc", "memory"); Same comment here about WFE. -- Catalin