xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics)
@ 2013-07-19 15:19 Ian Campbell
  2013-07-19 15:20 ` [PATCH 1/3] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Ian Campbell @ 2013-07-19 15:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Julien Grall, Stefano Stabellini, Tim Deegan

The following patches update and resync some of the assembly primitives
which we got from Linux.

The first two have been posted independently before, the last one is
new.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/3] xen/arm64: Assembly optimized bitops from Linux
  2013-07-19 15:19 [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
@ 2013-07-19 15:20 ` Ian Campbell
  2013-07-19 15:20 ` [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux Ian Campbell
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Ian Campbell @ 2013-07-19 15:20 UTC (permalink / raw)
  To: xen-devel; +Cc: julien.grall, tim, Ian Campbell, stefano.stabellini

This patch replaces the previous hashed lock implementation of bitops with
assembly optimized ones taken from Linux v3.10-rc4.

The Linux derived ASM only supports 8 byte aligned bitmaps (which under Linux
are unsigned long * rather than our void *). We do actually have uses of 4
byte alignment (i.e. the bitmaps in struct xmem_pool) which trigger alignment
faults.

Therefore adjust the assembly to work in 4 byte increments, which involved:
    - bit offset now bits 4:0 => mask #31 not #63
    - use wN register not xN for load/modify/store loop.

There is no need to adjust the shift used to calculate the word offset, the
difference is already accounted for in the #63->#31 change.

NB: Xen's build system cannot cope with the change from .c to .S file,
remove xen/arch/arm/arm64/lib/.bitops.o.d or clean your build tree.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
---
v2: No need to adjust the word offset shift size.
---
 xen/arch/arm/arm64/lib/bitops.S    |   68 ++++++++++++
 xen/arch/arm/arm64/lib/bitops.c    |   22 ----
 xen/include/asm-arm/arm64/bitops.h |  203 ++----------------------------------
 3 files changed, 76 insertions(+), 217 deletions(-)
 create mode 100644 xen/arch/arm/arm64/lib/bitops.S
 delete mode 100644 xen/arch/arm/arm64/lib/bitops.c

diff --git a/xen/arch/arm/arm64/lib/bitops.S b/xen/arch/arm/arm64/lib/bitops.S
new file mode 100644
index 0000000..80cc903
--- /dev/null
+++ b/xen/arch/arm/arm64/lib/bitops.S
@@ -0,0 +1,68 @@
+/*
+ * Based on linux/arch/arm64/lib/bitops.h which in turn is
+ * Based on arch/arm/lib/bitops.h
+ *
+ * Copyright (C) 2013 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/config.h>
+
+/*
+ * x0: bits 4:0  bit offset
+ *     bits 31:5 word offset
+ * x1: address
+ */
+	.macro	bitop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #31		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #3	// Get word offset
+	lsl	x3, x2, x3		// Create mask
+1:	ldxr	w2, [x1]
+	\instr	w2, w2, w3
+	stxr	w0, w2, [x1]
+	cbnz	w0, 1b
+	ret
+ENDPROC(\name	)
+	.endm
+
+	.macro	testop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #31		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #3	// Get word offset
+	lsl	x4, x2, x3		// Create mask
+1:	ldaxr	w2, [x1]
+	lsr	w0, w2, w3		// Save old value of bit
+	\instr	w2, w2, w4		// toggle bit
+	stlxr	w5, w2, [x1]
+	cbnz	w5, 1b
+	and	w0, w0, #1
+3:	ret
+ENDPROC(\name	)
+	.endm
+
+/*
+ * Atomic bit operations.
+ */
+	bitop	change_bit, eor
+	bitop	clear_bit, bic
+	bitop	set_bit, orr
+
+	testop	test_and_change_bit, eor
+	testop	test_and_clear_bit, bic
+	testop	test_and_set_bit, orr
diff --git a/xen/arch/arm/arm64/lib/bitops.c b/xen/arch/arm/arm64/lib/bitops.c
deleted file mode 100644
index 02d8d78..0000000
--- a/xen/arch/arm/arm64/lib/bitops.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Limited
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <xen/spinlock.h>
-#include <xen/bitops.h>
-
-spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] /*__lock_aligned*/ = {
-       [0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED
-};
diff --git a/xen/include/asm-arm/arm64/bitops.h b/xen/include/asm-arm/arm64/bitops.h
index 0a6eba3..b43931d 100644
--- a/xen/include/asm-arm/arm64/bitops.h
+++ b/xen/include/asm-arm/arm64/bitops.h
@@ -1,200 +1,15 @@
 #ifndef _ARM_ARM64_BITOPS_H
 #define _ARM_ARM64_BITOPS_H
 
-/* Generic bitop support. Based on linux/include/asm-generic/bitops/atomic.h */
-
-#include <xen/spinlock.h>
-#include <xen/cache.h>          /* we use L1_CACHE_BYTES */
-
-/* Use an array of spinlocks for our atomic_ts.
- * Hash function to index into a different SPINLOCK.
- * Since "a" is usually an address, use one spinlock per cacheline.
- */
-#  define ATOMIC_HASH_SIZE 4
-#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
-
-extern spinlock_t __atomic_hash[ATOMIC_HASH_SIZE]/* __lock_aligned*/;
-
-#define _atomic_spin_lock_irqsave(l,f) do {     \
-       spinlock_t *s = ATOMIC_HASH(l);          \
-       spin_lock_irqsave(s, f);\
-} while(0)
-
-#define _atomic_spin_unlock_irqrestore(l,f) do {\
-        spinlock_t *s = ATOMIC_HASH(l);         \
-        spin_unlock_irqrestore(s,f);		\
-} while(0)
-
-#define FIXUP(_p, _mask)                        \
-    {                                           \
-        unsigned long __p = (unsigned long)_p;  \
-        if (__p & 0x7) {                        \
-            if (_mask > 0xffffffff) {           \
-             __p = (__p+32)&~0x7; _mask >>=32;  \
-            } else {                            \
-                __p &= ~0x7; _mask <<= 32;      \
-            }                                   \
-            if (0)printk("BITOPS: Fixup misaligned ptr %p => %#lx\n", _p, __p); \
-            _p = (void *)__p;                   \
-        }                                       \
-    }
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-
-static inline void set_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
-        //       nr, addr, mask, p, ATOMIC_HASH(p));
-        FIXUP(p, mask);
-        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
-        //       nr, addr, mask, p, ATOMIC_HASH(p));
-        //printk("before *p is %#lx\n", *p);
-	_atomic_spin_lock_irqsave(p, flags);
-	*p  |= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-        //printk(" after *p is %#lx\n", *p);
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	*p &= ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered. It may be
- * reordered on other architectures than x86.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	*p ^= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It may be reordered on other architectures than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old | mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It can be reorderdered on other architectures other than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old & ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
+/*
+ * Little endian assembly atomic bitops.
  */
-static inline int test_and_change_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old ^ mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
+extern void set_bit(int nr, volatile void *p);
+extern void clear_bit(int nr, volatile void *p);
+extern void change_bit(int nr, volatile void *p);
+extern int test_and_set_bit(int nr, volatile void *p);
+extern int test_and_clear_bit(int nr, volatile void *p);
+extern int test_and_change_bit(int nr, volatile void *p);
 
 /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */
 /**
@@ -217,8 +32,6 @@ static /*__*/always_inline unsigned long __ffs(unsigned long word)
  */
 #define ffz(x)  __ffs(~(x))
 
-
-
 /* Based on linux/include/asm-generic/bitops/find.h */
 
 #ifndef find_next_bit
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-19 15:19 [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
  2013-07-19 15:20 ` [PATCH 1/3] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
@ 2013-07-19 15:20 ` Ian Campbell
  2013-07-29 16:02   ` Tim Deegan
  2013-07-19 15:20 ` [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock Ian Campbell
  2013-08-22 14:56 ` [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
  3 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-19 15:20 UTC (permalink / raw)
  To: xen-devel; +Cc: julien.grall, tim, Ian Campbell, stefano.stabellini

This picks up the changes from Linux commit 3a0310eb369a:
    arm64: atomics: fix grossly inconsistent asm constraints for exclusives

    Our uses of inline asm constraints for atomic operations are fairly
    wild and varied. We basically need to guarantee the following:

      1. Any instructions with barrier implications
         (load-acquire/store-release) have a "memory" clobber

      2. When performing exclusive accesses, the addresing mode is generated
         using the "Q" constraint

      3. Atomic blocks which use the condition flags, have a "cc" clobber

    This patch addresses these concerns which, as well as fixing the
    semantics of the code, stops GCC complaining about impossible asm
    constraints.

    Signed-off-by: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 xen/include/asm-arm/arm64/atomic.h   |   66 +++++++++++++++---------------
 xen/include/asm-arm/arm64/spinlock.h |   48 +++++++++++-----------
 xen/include/asm-arm/arm64/system.h   |   74 +++++++++++++++++-----------------
 3 files changed, 94 insertions(+), 94 deletions(-)

diff --git a/xen/include/asm-arm/arm64/atomic.h b/xen/include/asm-arm/arm64/atomic.h
index 5e4ffed..a279755 100644
--- a/xen/include/asm-arm/arm64/atomic.h
+++ b/xen/include/asm-arm/arm64/atomic.h
@@ -33,12 +33,12 @@ static inline void atomic_add(int i, atomic_t *v)
 	int result;
 
 	asm volatile("// atomic_add\n"
-"1:	ldxr	%w0, [%3]\n"
-"	add	%w0, %w0, %w4\n"
-"	stxr	%w1, %w0, [%3]\n"
+"1:	ldxr	%w0, %2\n"
+"	add	%w0, %w0, %w3\n"
+"	stxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -48,13 +48,13 @@ static inline int atomic_add_return(int i, atomic_t *v)
 	int result;
 
 	asm volatile("// atomic_add_return\n"
-"1:	ldaxr	%w0, [%3]\n"
-"	add	%w0, %w0, %w4\n"
-"	stlxr	%w1, %w0, [%3]\n"
+"1:	ldaxr	%w0, %2\n"
+"	add	%w0, %w0, %w3\n"
+"	stlxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -65,12 +65,12 @@ static inline void atomic_sub(int i, atomic_t *v)
 	int result;
 
 	asm volatile("// atomic_sub\n"
-"1:	ldxr	%w0, [%3]\n"
-"	sub	%w0, %w0, %w4\n"
-"	stxr	%w1, %w0, [%3]\n"
+"1:	ldxr	%w0, %2\n"
+"	sub	%w0, %w0, %w3\n"
+"	stxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -80,13 +80,13 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 	int result;
 
 	asm volatile("// atomic_sub_return\n"
-"1:	ldaxr	%w0, [%3]\n"
-"	sub	%w0, %w0, %w4\n"
-"	stlxr	%w1, %w0, [%3]\n"
+"1:	ldaxr	%w0, %2\n"
+"	sub	%w0, %w0, %w3\n"
+"	stlxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -97,15 +97,15 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	int oldval;
 
 	asm volatile("// atomic_cmpxchg\n"
-"1:	ldaxr	%w1, [%3]\n"
-"	cmp	%w1, %w4\n"
+"1:	ldaxr	%w1, %2\n"
+"	cmp	%w1, %w3\n"
 "	b.ne	2f\n"
-"	stlxr	%w0, %w5, [%3]\n"
+"	stlxr	%w0, %w4, %2\n"
 "	cbnz	%w0, 1b\n"
 "2:"
-	: "=&r" (tmp), "=&r" (oldval), "+o" (ptr->counter)
-	: "r" (&ptr->counter), "Ir" (old), "r" (new)
-	: "cc");
+	: "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter)
+	: "Ir" (old), "r" (new)
+	: "cc", "memory");
 
 	return oldval;
 }
@@ -115,12 +115,12 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
 	unsigned long tmp, tmp2;
 
 	asm volatile("// atomic_clear_mask\n"
-"1:	ldxr	%0, [%3]\n"
-"	bic	%0, %0, %4\n"
-"	stxr	%w1, %0, [%3]\n"
+"1:	ldxr	%0, %2\n"
+"	bic	%0, %0, %3\n"
+"	stxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (tmp), "=&r" (tmp2), "+o" (*addr)
-	: "r" (addr), "Ir" (mask)
+	: "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr)
+	: "Ir" (mask)
 	: "cc");
 }
 
diff --git a/xen/include/asm-arm/arm64/spinlock.h b/xen/include/asm-arm/arm64/spinlock.h
index fe4c403..717f2fe 100644
--- a/xen/include/asm-arm/arm64/spinlock.h
+++ b/xen/include/asm-arm/arm64/spinlock.h
@@ -31,8 +31,8 @@ static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
     ASSERT(_raw_spin_is_locked(lock));
 
     asm volatile(
-        "       stlr    %w1, [%0]\n"
-        : : "r" (&lock->lock), "r" (0) : "memory");
+        "       stlr    %w1, %0\n"
+        : "=Q" (lock->lock) : "r" (0) : "memory");
 }
 
 static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
@@ -40,13 +40,13 @@ static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
     unsigned int tmp;
 
     asm volatile(
-        "       ldaxr   %w0, [%1]\n"
+        "       ldaxr   %w0, %1\n"
         "       cbnz    %w0, 1f\n"
-        "       stxr    %w0, %w2, [%1]\n"
+        "       stxr    %w0, %w2, %1\n"
         "1:\n"
-        : "=&r" (tmp)
-        : "r" (&lock->lock), "r" (1)
-        : "memory");
+        : "=&r" (tmp), "+Q" (lock->lock)
+        : "r" (1)
+        : "cc", "memory");
 
     return !tmp;
 }
@@ -62,14 +62,14 @@ static always_inline int _raw_read_trylock(raw_rwlock_t *rw)
     unsigned int tmp, tmp2 = 1;
 
     asm volatile(
-        "       ldaxr   %w0, [%2]\n"
+        "       ldaxr   %w0, %2\n"
         "       add     %w0, %w0, #1\n"
         "       tbnz    %w0, #31, 1f\n"
-        "       stxr    %w1, %w0, [%2]\n"
+        "       stxr    %w1, %w0, %2\n"
         "1:\n"
-        : "=&r" (tmp), "+r" (tmp2)
-        : "r" (&rw->lock)
-        : "memory");
+        : "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
+        :
+        : "cc", "memory");
 
     return !tmp2;
 }
@@ -79,13 +79,13 @@ static always_inline int _raw_write_trylock(raw_rwlock_t *rw)
     unsigned int tmp;
 
     asm volatile(
-        "       ldaxr   %w0, [%1]\n"
+        "       ldaxr   %w0, %1\n"
         "       cbnz    %w0, 1f\n"
-        "       stxr    %w0, %w2, [%1]\n"
+        "       stxr    %w0, %w2, %1\n"
         "1:\n"
-        : "=&r" (tmp)
-        : "r" (&rw->lock), "r" (0x80000000)
-        : "memory");
+        : "=&r" (tmp), "+Q" (rw->lock)
+        : "r" (0x80000000)
+        : "cc", "memory");
 
     return !tmp;
 }
@@ -95,20 +95,20 @@ static inline void _raw_read_unlock(raw_rwlock_t *rw)
     unsigned int tmp, tmp2;
 
     asm volatile(
-        "1:     ldxr    %w0, [%2]\n"
+        "    1: ldxr    %w0, %2\n"
         "       sub     %w0, %w0, #1\n"
-        "       stlxr   %w1, %w0, [%2]\n"
+        "       stlxr   %w1, %w0, %2\n"
         "       cbnz    %w1, 1b\n"
-        : "=&r" (tmp), "=&r" (tmp2)
-        : "r" (&rw->lock)
-        : "memory");
+        : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
+        :
+        : "cc", "memory");
 }
 
 static inline void _raw_write_unlock(raw_rwlock_t *rw)
 {
     asm volatile(
-        "       stlr    %w1, [%0]\n"
-        : : "r" (&rw->lock), "r" (0) : "memory");
+        "       stlr    %w1, %0\n"
+        : "=Q" (rw->lock) : "r" (0) : "memory");
 }
 
 #define _raw_rw_is_locked(x) ((x)->lock != 0)
diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
index 4e41913..d7e912f 100644
--- a/xen/include/asm-arm/arm64/system.h
+++ b/xen/include/asm-arm/arm64/system.h
@@ -28,39 +28,39 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
         switch (size) {
         case 1:
                 asm volatile("//        __xchg1\n"
-                "1:     ldaxrb  %w0, [%3]\n"
-                "       stlxrb  %w1, %w2, [%3]\n"
+                "1:     ldaxrb  %w0, %2\n"
+                "       stlxrb  %w1, %w3, %2\n"
                 "       cbnz    %w1, 1b\n"
-                        : "=&r" (ret), "=&r" (tmp)
-                        : "r" (x), "r" (ptr)
-                        : "memory", "cc");
+                        : "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr)
+                        : "r" (x)
+                        : "cc", "memory");
                 break;
         case 2:
                 asm volatile("//        __xchg2\n"
-                "1:     ldaxrh  %w0, [%3]\n"
-                "       stlxrh  %w1, %w2, [%3]\n"
+                "1:     ldaxrh  %w0, %2\n"
+                "       stlxrh  %w1, %w3, %2\n"
                 "       cbnz    %w1, 1b\n"
-                        : "=&r" (ret), "=&r" (tmp)
-                        : "r" (x), "r" (ptr)
-                        : "memory", "cc");
+                        : "=&r" (ret), "=&r" (tmp), "+Q" (*(u16 *)ptr)
+                        : "r" (x)
+                        : "cc", "memory");
                 break;
         case 4:
                 asm volatile("//        __xchg4\n"
-                "1:     ldaxr   %w0, [%3]\n"
-                "       stlxr   %w1, %w2, [%3]\n"
+                "1:     ldaxr   %w0, %2\n"
+                "       stlxr   %w1, %w3, %2\n"
                 "       cbnz    %w1, 1b\n"
-                        : "=&r" (ret), "=&r" (tmp)
-                        : "r" (x), "r" (ptr)
-                        : "memory", "cc");
+                        : "=&r" (ret), "=&r" (tmp), "+Q" (*(u32 *)ptr)
+                        : "r" (x)
+                        : "cc", "memory");
                 break;
         case 8:
                 asm volatile("//        __xchg8\n"
-                "1:     ldaxr   %0, [%3]\n"
-                "       stlxr   %w1, %2, [%3]\n"
+                "1:     ldaxr   %0, %2\n"
+                "       stlxr   %w1, %3, %2\n"
                 "       cbnz    %w1, 1b\n"
-                        : "=&r" (ret), "=&r" (tmp)
-                        : "r" (x), "r" (ptr)
-                        : "memory", "cc");
+                        : "=&r" (ret), "=&r" (tmp), "+Q" (*(u64 *)ptr)
+                        : "r" (x)
+                        : "cc", "memory");
                 break;
         default:
                 __bad_xchg(ptr, size), ret = 0;
@@ -84,14 +84,14 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
         case 1:
                 do {
                         asm volatile("// __cmpxchg1\n"
-                        "       ldxrb   %w1, [%2]\n"
+                        "       ldxrb   %w1, %2\n"
                         "       mov     %w0, #0\n"
                         "       cmp     %w1, %w3\n"
                         "       b.ne    1f\n"
-                        "       stxrb   %w0, %w4, [%2]\n"
+                        "       stxrb   %w0, %w4, %2\n"
                         "1:\n"
-                                : "=&r" (res), "=&r" (oldval)
-                                : "r" (ptr), "Ir" (old), "r" (new)
+                                : "=&r" (res), "=&r" (oldval), "+Q" (*(u8 *)ptr)
+                                : "Ir" (old), "r" (new)
                                 : "cc");
                 } while (res);
                 break;
@@ -99,29 +99,29 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
         case 2:
                 do {
                         asm volatile("// __cmpxchg2\n"
-                        "       ldxrh   %w1, [%2]\n"
+                        "       ldxrh   %w1, %2\n"
                         "       mov     %w0, #0\n"
                         "       cmp     %w1, %w3\n"
                         "       b.ne    1f\n"
-                        "       stxrh   %w0, %w4, [%2]\n"
+                        "       stxrh   %w0, %w4, %2\n"
                         "1:\n"
-                                : "=&r" (res), "=&r" (oldval)
-                                : "r" (ptr), "Ir" (old), "r" (new)
-                                : "memory", "cc");
+                                : "=&r" (res), "=&r" (oldval), "+Q" (*(u16 *)ptr)
+                                : "Ir" (old), "r" (new)
+                                : "cc");
                 } while (res);
                 break;
 
         case 4:
                 do {
                         asm volatile("// __cmpxchg4\n"
-                        "       ldxr    %w1, [%2]\n"
+                        "       ldxr    %w1, %2\n"
                         "       mov     %w0, #0\n"
                         "       cmp     %w1, %w3\n"
                         "       b.ne    1f\n"
-                        "       stxr    %w0, %w4, [%2]\n"
+                        "       stxr    %w0, %w4, %2\n"
                         "1:\n"
-                                : "=&r" (res), "=&r" (oldval)
-                                : "r" (ptr), "Ir" (old), "r" (new)
+                                : "=&r" (res), "=&r" (oldval), "+Q" (*(u32 *)ptr)
+                                : "Ir" (old), "r" (new)
                                 : "cc");
                 } while (res);
                 break;
@@ -129,14 +129,14 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
         case 8:
                 do {
                         asm volatile("// __cmpxchg8\n"
-                        "       ldxr    %1, [%2]\n"
+                        "       ldxr    %1, %2\n"
                         "       mov     %w0, #0\n"
                         "       cmp     %1, %3\n"
                         "       b.ne    1f\n"
-                        "       stxr    %w0, %4, [%2]\n"
+                        "       stxr    %w0, %4, %2\n"
                         "1:\n"
-                                : "=&r" (res), "=&r" (oldval)
-                                : "r" (ptr), "Ir" (old), "r" (new)
+                                : "=&r" (res), "=&r" (oldval), "+Q" (*(u64 *)ptr)
+                                : "Ir" (old), "r" (new)
                                 : "cc");
                 } while (res);
                 break;
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock.
  2013-07-19 15:19 [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
  2013-07-19 15:20 ` [PATCH 1/3] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
  2013-07-19 15:20 ` [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux Ian Campbell
@ 2013-07-19 15:20 ` Ian Campbell
  2013-07-29 15:52   ` Tim Deegan
  2013-08-22 14:56 ` [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
  3 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-19 15:20 UTC (permalink / raw)
  To: xen-devel; +Cc: julien.grall, tim, Ian Campbell, stefano.stabellini

This comes from the Linux patches 15e7e5c1ebf5 for arm32 and 4ecf7ccb1973 for
arm64 by Will Deacon and Catalin Marinas respectively. The Linux commit message
says:

    An exclusive store instruction may fail for reasons other than lock
    contention (e.g. a cache eviction during the critical section) so, in
    line with other architectures using similar exclusive instructions
    (alpha, mips, powerpc), retry the trylock operation if the lock appears
    to be free but the strex reported failure.

I have observed this due to register_cpu_notifier containing:
    if ( !spin_trylock(&cpu_add_remove_lock) )
        BUG(); /* Should never fail as we are called only during boot. */
which was spuriously failing.

The ARMv8 variant is taken directly from the Linux patch. For v7 I had to
reimplement since we don't currently use ticket locks.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 xen/include/asm-arm/arm32/spinlock.h |   25 ++++++++++++++-----------
 xen/include/asm-arm/arm64/spinlock.h |    3 ++-
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/xen/include/asm-arm/arm32/spinlock.h b/xen/include/asm-arm/arm32/spinlock.h
index 4a11a97..ba11ad6 100644
--- a/xen/include/asm-arm/arm32/spinlock.h
+++ b/xen/include/asm-arm/arm32/spinlock.h
@@ -34,17 +34,20 @@ static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
 
 static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
 {
-    unsigned long tmp;
-
-    __asm__ __volatile__(
-"   ldrex   %0, [%1]\n"
-"   teq     %0, #0\n"
-"   strexeq %0, %2, [%1]"
-    : "=&r" (tmp)
-    : "r" (&lock->lock), "r" (1)
-    : "cc");
-
-    if (tmp == 0) {
+    unsigned long contended, res;
+
+    do {
+        __asm__ __volatile__(
+    "   ldrex   %0, [%2]\n"
+    "   teq     %0, #0\n"
+    "   strexeq %1, %3, [%2]\n"
+    "   movne   %1, #0\n"
+        : "=&r" (contended), "=r" (res)
+        : "r" (&lock->lock), "r" (1)
+        : "cc");
+    } while (res);
+
+    if (!contended) {
         smp_mb();
         return 1;
     } else {
diff --git a/xen/include/asm-arm/arm64/spinlock.h b/xen/include/asm-arm/arm64/spinlock.h
index 717f2fe..3a36cfd 100644
--- a/xen/include/asm-arm/arm64/spinlock.h
+++ b/xen/include/asm-arm/arm64/spinlock.h
@@ -40,9 +40,10 @@ static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
     unsigned int tmp;
 
     asm volatile(
-        "       ldaxr   %w0, %1\n"
+        "2:     ldaxr   %w0, %1\n"
         "       cbnz    %w0, 1f\n"
         "       stxr    %w0, %w2, %1\n"
+        "       cbnz    %w0, 2b\n"
         "1:\n"
         : "=&r" (tmp), "+Q" (lock->lock)
         : "r" (1)
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock.
  2013-07-19 15:20 ` [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock Ian Campbell
@ 2013-07-29 15:52   ` Tim Deegan
  2013-07-29 16:20     ` Ian Campbell
  0 siblings, 1 reply; 16+ messages in thread
From: Tim Deegan @ 2013-07-29 15:52 UTC (permalink / raw)
  To: Ian Campbell; +Cc: julien.grall, stefano.stabellini, xen-devel

At 16:20 +0100 on 19 Jul (1374250810), Ian Campbell wrote:
> This comes from the Linux patches 15e7e5c1ebf5 for arm32 and 4ecf7ccb1973 for
> arm64 by Will Deacon and Catalin Marinas respectively. The Linux commit message
> says:
> 
>     An exclusive store instruction may fail for reasons other than lock
>     contention (e.g. a cache eviction during the critical section) so, in
>     line with other architectures using similar exclusive instructions
>     (alpha, mips, powerpc), retry the trylock operation if the lock appears
>     to be free but the strex reported failure.
> 
> I have observed this due to register_cpu_notifier containing:
>     if ( !spin_trylock(&cpu_add_remove_lock) )
>         BUG(); /* Should never fail as we are called only during boot. */
> which was spuriously failing.
> 
> The ARMv8 variant is taken directly from the Linux patch. For v7 I had to
> reimplement since we don't currently use ticket locks.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

Looks good, but:

>  static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
>  {
> -    unsigned long tmp;
> -
> -    __asm__ __volatile__(
> -"   ldrex   %0, [%1]\n"
> -"   teq     %0, #0\n"
> -"   strexeq %0, %2, [%1]"
> -    : "=&r" (tmp)
> -    : "r" (&lock->lock), "r" (1)
> -    : "cc");
> -
> -    if (tmp == 0) {
> +    unsigned long contended, res;
> +
> +    do {
> +        __asm__ __volatile__(
> +    "   ldrex   %0, [%2]\n"
> +    "   teq     %0, #0\n"
> +    "   strexeq %1, %3, [%2]\n"
> +    "   movne   %1, #0\n"
> +        : "=&r" (contended), "=r" (res)
> +        : "r" (&lock->lock), "r" (1)

Shouldn't this be using a 'Q' constraint for the lock, following the
pattern set in patch 2/3?

> +        : "cc");
> +    } while (res);
> +
> +    if (!contended) {
>          smp_mb();
>          return 1;
>      } else {

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-19 15:20 ` [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux Ian Campbell
@ 2013-07-29 16:02   ` Tim Deegan
  2013-07-29 16:13     ` Ian Campbell
  0 siblings, 1 reply; 16+ messages in thread
From: Tim Deegan @ 2013-07-29 16:02 UTC (permalink / raw)
  To: Ian Campbell; +Cc: julien.grall, stefano.stabellini, xen-devel

At 16:20 +0100 on 19 Jul (1374250809), Ian Campbell wrote:
> This picks up the changes from Linux commit 3a0310eb369a:
>     arm64: atomics: fix grossly inconsistent asm constraints for exclusives
> 
>     Our uses of inline asm constraints for atomic operations are fairly
>     wild and varied. We basically need to guarantee the following:
> 
>       1. Any instructions with barrier implications
>          (load-acquire/store-release) have a "memory" clobber
> 
>       2. When performing exclusive accesses, the addressing mode is generated
>          using the "Q" constraint
> 
>       3. Atomic blocks which use the condition flags, have a "cc" clobber
> 
>     This patch addresses these concerns which, as well as fixing the
>     semantics of the code, stops GCC complaining about impossible asm
>     constraints.
> 
>     Signed-off-by: Will Deacon <will.deacon@arm.com>
>     Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

In so far as this is just pulling in upstream fixes to code we copied
from linux, Acked-by: Tim Deegan <tim@xen.org>

But I don't understand the new 'memory' clobbers around in this patch.
Or rather, I don't understand why there are memory clobbers but not
DMBs.

The ARM ARM says: 

"The synchronization primitives follow the memory order model of the
 memory type accessed by the instructions. For this reason:
  - Portable code for claiming a spin-lock must include a Data
    Memory Barrier (DMB) operation, performed by a DMB instruction,
    between claiming the spin-lock and making any access that makes 
    use of the spin-lock.
  - Portable code for releasing a spin-lock must include a DMB
    instruction before writing to clear the spin-lock.
 This requirement applies to code using:
  - the Load-Exclusive/Store-Exclusive instruction pairs, for
    example LDREX/STREX
  - the deprecated synchronization primitives, SWP/SWPB."

Are the callers of these ops expected to put in the DMBs separately?

Tim.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-29 16:02   ` Tim Deegan
@ 2013-07-29 16:13     ` Ian Campbell
  2013-07-29 18:05       ` Will Deacon
  0 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-29 16:13 UTC (permalink / raw)
  To: Tim Deegan
  Cc: julien.grall, Catalin Marinas, stefano.stabellini, Will Deacon,
	xen-devel

On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> At 16:20 +0100 on 19 Jul (1374250809), Ian Campbell wrote:
> > This picks up the changes from Linux commit 3a0310eb369a:
> >     arm64: atomics: fix grossly inconsistent asm constraints for exclusives
> > 
> >     Our uses of inline asm constraints for atomic operations are fairly
> >     wild and varied. We basically need to guarantee the following:
> > 
> >       1. Any instructions with barrier implications
> >          (load-acquire/store-release) have a "memory" clobber
> > 
> >       2. When performing exclusive accesses, the addressing mode is generated
> >          using the "Q" constraint
> > 
> >       3. Atomic blocks which use the condition flags, have a "cc" clobber
> > 
> >     This patch addresses these concerns which, as well as fixing the
> >     semantics of the code, stops GCC complaining about impossible asm
> >     constraints.
> > 
> >     Signed-off-by: Will Deacon <will.deacon@arm.com>
> >     Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> > 
> > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> 
> In so far as this is just pulling in upstream fixes to code we copied
> from linux, Acked-by: Tim Deegan <tim@xen.org>

Thanks.

> But I don't understand the new 'memory' clobbers around in this patch.
> Or rather, I don't understand why there are memory clobbers but not
> DMBs.

I've no idea, lets ask Will & Catalin (both CCd).

> 
> The ARM ARM says: 
> 
> "The synchronization primitives follow the memory order model of the
>  memory type accessed by the instructions. For this reason:
>   - Portable code for claiming a spin-lock must include a Data
>     Memory Barrier (DMB) operation, performed by a DMB instruction,
>     between claiming the spin-lock and making any access that makes 
>     use of the spin-lock.
>   - Portable code for releasing a spin-lock must include a DMB
>     instruction before writing to clear the spin-lock.
>  This requirement applies to code using:
>   - the Load-Exclusive/Store-Exclusive instruction pairs, for
>     example LDREX/STREX
>   - the deprecated synchronization primitives, SWP/SWPB."
> 
> Are the callers of these ops expected to put in the DMBs separately?

Given that this is an interface used from common code I really expect
not.

Ian.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock.
  2013-07-29 15:52   ` Tim Deegan
@ 2013-07-29 16:20     ` Ian Campbell
  2013-07-29 16:25       ` Tim Deegan
  0 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-29 16:20 UTC (permalink / raw)
  To: Tim Deegan; +Cc: julien.grall, stefano.stabellini, xen-devel

On Mon, 2013-07-29 at 16:52 +0100, Tim Deegan wrote:
> At 16:20 +0100 on 19 Jul (1374250810), Ian Campbell wrote:
> > This comes from the Linux patches 15e7e5c1ebf5 for arm32 and 4ecf7ccb1973 for
> > arm64 by Will Deacon and Catalin Marinas respectively. The Linux commit message
> > says:
> > 
> >     An exclusive store instruction may fail for reasons other than lock
> >     contention (e.g. a cache eviction during the critical section) so, in
> >     line with other architectures using similar exclusive instructions
> >     (alpha, mips, powerpc), retry the trylock operation if the lock appears
> >     to be free but the strex reported failure.
> > 
> > I have observed this due to register_cpu_notifier containing:
> >     if ( !spin_trylock(&cpu_add_remove_lock) )
> >         BUG(); /* Should never fail as we are called only during boot. */
> > which was spuriously failing.
> > 
> > The ARMv8 variant is taken directly from the Linux patch. For v7 I had to
> > reimplement since we don't currently use ticket locks.
> > 
> > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> 
> Looks good, but:
> 
> >  static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
> >  {
> > -    unsigned long tmp;
> > -
> > -    __asm__ __volatile__(
> > -"   ldrex   %0, [%1]\n"
> > -"   teq     %0, #0\n"
> > -"   strexeq %0, %2, [%1]"
> > -    : "=&r" (tmp)
> > -    : "r" (&lock->lock), "r" (1)
> > -    : "cc");
> > -
> > -    if (tmp == 0) {
> > +    unsigned long contended, res;
> > +
> > +    do {
> > +        __asm__ __volatile__(
> > +    "   ldrex   %0, [%2]\n"
> > +    "   teq     %0, #0\n"
> > +    "   strexeq %1, %3, [%2]\n"
> > +    "   movne   %1, #0\n"
> > +        : "=&r" (contended), "=r" (res)
> > +        : "r" (&lock->lock), "r" (1)
> 
> Shouldn't this be using a 'Q' constraint for the lock, following the
> pattern set in patch 2/3?

That patch was arm64 specific while this is arm32 code. 'Q' is a machine
specific constraint which is at least worded differently for 32-vs-64 in
http://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints although I suppose they both read as the same thing.

I suppose the answer is that ldrex etc can take [rN, #imm] arguments,
which is what "Q" rather than "r" is trying to avoid, whereas the newer
armv8 atomic instructions do not take a #imm (it is documented as "[rN,
#0]"), so you have to use Q there.

Ian.

> 
> > +        : "cc");
> > +    } while (res);
> > +
> > +    if (!contended) {
> >          smp_mb();
> >          return 1;
> >      } else {

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock.
  2013-07-29 16:20     ` Ian Campbell
@ 2013-07-29 16:25       ` Tim Deegan
  0 siblings, 0 replies; 16+ messages in thread
From: Tim Deegan @ 2013-07-29 16:25 UTC (permalink / raw)
  To: Ian Campbell; +Cc: julien.grall, xen-devel, stefano.stabellini

At 17:20 +0100 on 29 Jul (1375118421), Ian Campbell wrote:
> On Mon, 2013-07-29 at 16:52 +0100, Tim Deegan wrote:
> > At 16:20 +0100 on 19 Jul (1374250810), Ian Campbell wrote:
> > > This comes from the Linux patches 15e7e5c1ebf5 for arm32 and 4ecf7ccb1973 for
> > > arm64 by Will Deacon and Catalin Marinas respectively. The Linux commit message
> > > says:
> > > 
> > >     An exclusive store instruction may fail for reasons other than lock
> > >     contention (e.g. a cache eviction during the critical section) so, in
> > >     line with other architectures using similar exclusive instructions
> > >     (alpha, mips, powerpc), retry the trylock operation if the lock appears
> > >     to be free but the strex reported failure.
> > > 
> > > I have observed this due to register_cpu_notifier containing:
> > >     if ( !spin_trylock(&cpu_add_remove_lock) )
> > >         BUG(); /* Should never fail as we are called only during boot. */
> > > which was spuriously failing.
> > > 
> > > The ARMv8 variant is taken directly from the Linux patch. For v7 I had to
> > > reimplement since we don't currently use ticket locks.
> > > 
> > > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > 
> > Looks good, but:
> > 
> > >  static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
> > >  {
> > > -    unsigned long tmp;
> > > -
> > > -    __asm__ __volatile__(
> > > -"   ldrex   %0, [%1]\n"
> > > -"   teq     %0, #0\n"
> > > -"   strexeq %0, %2, [%1]"
> > > -    : "=&r" (tmp)
> > > -    : "r" (&lock->lock), "r" (1)
> > > -    : "cc");
> > > -
> > > -    if (tmp == 0) {
> > > +    unsigned long contended, res;
> > > +
> > > +    do {
> > > +        __asm__ __volatile__(
> > > +    "   ldrex   %0, [%2]\n"
> > > +    "   teq     %0, #0\n"
> > > +    "   strexeq %1, %3, [%2]\n"
> > > +    "   movne   %1, #0\n"
> > > +        : "=&r" (contended), "=r" (res)
> > > +        : "r" (&lock->lock), "r" (1)
> > 
> > Shouldn't this be using a 'Q' constraint for the lock, following the
> > pattern set in patch 2/3?
> 
> That patch was arm64 specific while this is arm32 code. 'Q' is a machine
> specific constraint which is at least worded differently for 32-vs-64 in
> http://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints although I suppose they both read as the same thing.
> 
> I suppose the answer is that ldrex etc can take [rN, #imm] arguments,
> which is what "Q" rather than "r" is trying to avoid, where as the newer
> armv8 atomic instructions do not take a #imm (it is documented as "[rN,
> #0]"), so you have to use Q there.

Righto, thanks.

Acked-by: Tim Deegan <tim@xen.org>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-29 16:13     ` Ian Campbell
@ 2013-07-29 18:05       ` Will Deacon
  2013-07-30  9:34         ` Ian Campbell
  0 siblings, 1 reply; 16+ messages in thread
From: Will Deacon @ 2013-07-29 18:05 UTC (permalink / raw)
  To: Ian Campbell
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > But I don't understand the new 'memory' clobbers around in this patch.
> > Or rather, I don't understand why there are memory clobbers but not
> > DMBs.

The acquire/release instructions imply half barriers, but GCC may still
re-order instructions across them unless we include the memory clobber.

Will

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-29 18:05       ` Will Deacon
@ 2013-07-30  9:34         ` Ian Campbell
  2013-07-30  9:45           ` Will Deacon
  0 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-30  9:34 UTC (permalink / raw)
  To: Will Deacon
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Mon, 2013-07-29 at 19:05 +0100, Will Deacon wrote:
> On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> > On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > > But I don't understand the new 'memory' clobbers around in this patch.
> > > Or rather, I don't understand why there are memory clobbers but not
> > > DMBs.
> 
> The acquire/release instructions imply half barriers, but GCC may still
> re-order instructions across them unless we include the memory clobber.

The instructions which Tim is querying in the patch are
load/store-exclusive (ldxr,stxr), not acquire/release (ldaxr/stlxr et al.)
ones. The a/r ones do indeed have implicit barriers but do the l/s-excl
do too?

The ARM ARM Tim quoted was the v7 version but I don't see anything in
the v8 docs which I have available (which I know are incomplete and may
well be out of date) which says anything about barriers (implicit or
otherwise) wrt the l/s-excl instructions. I do see stuff relating to the
a/r instructions having implicit barriers.

Ian.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-30  9:34         ` Ian Campbell
@ 2013-07-30  9:45           ` Will Deacon
  2013-07-30  9:55             ` Ian Campbell
  0 siblings, 1 reply; 16+ messages in thread
From: Will Deacon @ 2013-07-30  9:45 UTC (permalink / raw)
  To: Ian Campbell
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Tue, Jul 30, 2013 at 10:34:12AM +0100, Ian Campbell wrote:
> On Mon, 2013-07-29 at 19:05 +0100, Will Deacon wrote:
> > On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> > > On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > > > But I don't understand the new 'memory' clobbers around in this patch.
> > > > Or rather, I don't understand why there are memory clobbers but not
> > > > DMBs.
> > 
> > The acquire/release instructions imply half barriers, but GCC may still
> > re-order instructions across them unless we include the memory clobber.
> 
> The instructions which Tim is querying patch the are
> load/store-exclusive (ldxr,stxr) not acquire/release (ldaxr/stlxr et al)
> ones. The a/r ones do indeed have implicit barriers but do the l/s-excl
> do too?
> 
> The ARM ARM Tim quoted was the v7 version but I don't see anything in
> the v8 docs which I have available (which I know are incomplete and may
> well be out of date) which says anything about barriers (implicit or
> otherwise) wrt the l/s-excl instructions. I do see stuff relating to the
> a/r instructions having implicit barriers.

Ah sorry, should've looked more closely. This is basically because not all
atomic operations in Linux imply barriers.

Take a look at Documentation/atomic_ops.txt and check whether it matches the
semantics required by Xen.

Will

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-30  9:45           ` Will Deacon
@ 2013-07-30  9:55             ` Ian Campbell
  2013-07-30  9:59               ` Will Deacon
  0 siblings, 1 reply; 16+ messages in thread
From: Ian Campbell @ 2013-07-30  9:55 UTC (permalink / raw)
  To: Will Deacon
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Tue, 2013-07-30 at 10:45 +0100, Will Deacon wrote:
> On Tue, Jul 30, 2013 at 10:34:12AM +0100, Ian Campbell wrote:
> > On Mon, 2013-07-29 at 19:05 +0100, Will Deacon wrote:
> > > On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> > > > On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > > > > But I don't understand the new 'memory' clobbers around in this patch.
> > > > > Or rather, I don't understand why there are memory clobbers but not
> > > > > DMBs.
> > > 
> > > The acquire/release instructions imply half barriers, but GCC may still
> > > re-order instructions across them unless we include the memory clobber.
> > 
> > The instructions which Tim is querying patch the are
> > load/store-exclusive (ldxr,stxr) not acquire/release (ldaxr/stlxr et al)
> > ones. The a/r ones do indeed have implicit barriers but do the l/s-excl
> > do too?
> > 
> > The ARM ARM Tim quoted was the v7 version but I don't see anything in
> > the v8 docs which I have available (which I know are incomplete and may
> > well be out of date) which says anything about barriers (implicit or
> > otherwise) wrt the l/s-excl instructions. I do see stuff relating to the
> > a/r instructions having implicit barriers.
> 
> Ah sorry, should've looked more closely. This is basically because not all
> atomic operations in Linux imply barriers.
> 
> Take a look at Documentation/atomic_ops.txt and check whether it matches the
> semantics required by Xen.

Ohoo, I wasn't aware of that (it's not that surprising mind). Some
auditing of the Xen code may be required here!

Ian.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-30  9:55             ` Ian Campbell
@ 2013-07-30  9:59               ` Will Deacon
  2013-07-30 10:12                 ` Ian Campbell
  0 siblings, 1 reply; 16+ messages in thread
From: Will Deacon @ 2013-07-30  9:59 UTC (permalink / raw)
  To: Ian Campbell
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Tue, Jul 30, 2013 at 10:55:01AM +0100, Ian Campbell wrote:
> On Tue, 2013-07-30 at 10:45 +0100, Will Deacon wrote:
> > On Tue, Jul 30, 2013 at 10:34:12AM +0100, Ian Campbell wrote:
> > > On Mon, 2013-07-29 at 19:05 +0100, Will Deacon wrote:
> > > > On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> > > > > On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > > > > > But I don't understand the new 'memory' clobbers around in this patch.
> > > > > > Or rather, I don't understand why there are memory clobbers but not
> > > > > > DMBs.
> > > > 
> > > > The acquire/release instructions imply half barriers, but GCC may still
> > > > re-order instructions across them unless we include the memory clobber.
> > > 
> > > The instructions which Tim is querying patch the are
> > > load/store-exclusive (ldxr,stxr) not acquire/release (ldaxr/stlxr et al)
> > > ones. The a/r ones do indeed have implicit barriers but do the l/s-excl
> > > do too?
> > > 
> > > The ARM ARM Tim quoted was the v7 version but I don't see anything in
> > > the v8 docs which I have available (which I know are incomplete and may
> > > well be out of date) which says anything about barriers (implicit or
> > > otherwise) wrt the l/s-excl instructions. I do see stuff relating to the
> > > a/r instructions having implicit barriers.
> > 
> > Ah sorry, should've looked more closely. This is basically because not all
> > atomic operations in Linux imply barriers.
> > 
> > Take a look at Documentation/atomic_ops.txt and check whether it matches the
> > semantics required by Xen.
> 
> Ohoo, I wasn't aware of that (it's not that surprising mind). Some
> auditing of the Xen code may be required here!

If you need overly strong barrier semantics (i.e. dmb(); op(); dmb()) then
you could look at the GCC atomic builtins, which are fairly heavy on the
barrier-front iirc.

Will

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux
  2013-07-30  9:59               ` Will Deacon
@ 2013-07-30 10:12                 ` Ian Campbell
  0 siblings, 0 replies; 16+ messages in thread
From: Ian Campbell @ 2013-07-30 10:12 UTC (permalink / raw)
  To: Will Deacon
  Cc: julien.grall@citrix.com, Catalin Marinas,
	stefano.stabellini@eu.citrix.com, Tim Deegan,
	xen-devel@lists.xen.org

On Tue, 2013-07-30 at 10:59 +0100, Will Deacon wrote:
> On Tue, Jul 30, 2013 at 10:55:01AM +0100, Ian Campbell wrote:
> > On Tue, 2013-07-30 at 10:45 +0100, Will Deacon wrote:
> > > On Tue, Jul 30, 2013 at 10:34:12AM +0100, Ian Campbell wrote:
> > > > On Mon, 2013-07-29 at 19:05 +0100, Will Deacon wrote:
> > > > > On Mon, Jul 29, 2013 at 05:13:02PM +0100, Ian Campbell wrote:
> > > > > > On Mon, 2013-07-29 at 17:02 +0100, Tim Deegan wrote:
> > > > > > > But I don't understand the new 'memory' clobbers around in this patch.
> > > > > > > Or rather, I don't understand why there are memory clobbers but not
> > > > > > > DMBs.
> > > > > 
> > > > > The acquire/release instructions imply half barriers, but GCC may still
> > > > > re-order instructions across them unless we include the memory clobber.
> > > > 
> > > > The instructions which Tim is querying patch the are
> > > > load/store-exclusive (ldxr,stxr) not acquire/release (ldaxr/stlxr et al)
> > > > ones. The a/r ones do indeed have implicit barriers but do the l/s-excl
> > > > do too?
> > > > 
> > > > The ARM ARM Tim quoted was the v7 version but I don't see anything in
> > > > the v8 docs which I have available (which I know are incomplete and may
> > > > well be out of date) which says anything about barriers (implicit or
> > > > otherwise) wrt the l/s-excl instructions. I do see stuff relating to the
> > > > a/r instructions having implicit barriers.
> > > 
> > > Ah sorry, should've looked more closely. This is basically because not all
> > > atomic operations in Linux imply barriers.
> > > 
> > > Take a look at Documentation/atomic_ops.txt and check whether it matches the
> > > semantics required by Xen.
> > 
> > Ohoo, I wasn't aware of that (it's not that surprising mind). Some
> > auditing of the Xen code may be required here!
> 
> If you need overly strong barrier semantics (i.e. dmb(); op(); dmb()) then
> you could look at the GCC atomic builtins, which are fairly heavy on the
> barrier-front iirc.

Good tip, thanks. Xen's x86 background means it probably assumes some
fairly heavy barriers in some places but we aren't generally averse to
just fixing those where needed.

Ian.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics)
  2013-07-19 15:19 [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
                   ` (2 preceding siblings ...)
  2013-07-19 15:20 ` [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock Ian Campbell
@ 2013-08-22 14:56 ` Ian Campbell
  3 siblings, 0 replies; 16+ messages in thread
From: Ian Campbell @ 2013-08-22 14:56 UTC (permalink / raw)
  To: xen-devel; +Cc: Julien Grall, Tim Deegan, Stefano Stabellini

On Fri, 2013-07-19 at 16:19 +0100, Ian Campbell wrote:
> The following patches update and resync some of the assembly primitives
> which we got from Linux.
> 
> The first two have been posted independently before, the last one is
> new.

All acked by Tim and now applied, thanks.

Ian.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2013-08-22 14:56 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-19 15:19 [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell
2013-07-19 15:20 ` [PATCH 1/3] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
2013-07-19 15:20 ` [PATCH 2/3] xen/arm64: resync atomics and spinlock asm with Linux Ian Campbell
2013-07-29 16:02   ` Tim Deegan
2013-07-29 16:13     ` Ian Campbell
2013-07-29 18:05       ` Will Deacon
2013-07-30  9:34         ` Ian Campbell
2013-07-30  9:45           ` Will Deacon
2013-07-30  9:55             ` Ian Campbell
2013-07-30  9:59               ` Will Deacon
2013-07-30 10:12                 ` Ian Campbell
2013-07-19 15:20 ` [PATCH 3/3] xen: arm: retry trylock if strex fails on free lock Ian Campbell
2013-07-29 15:52   ` Tim Deegan
2013-07-29 16:20     ` Ian Campbell
2013-07-29 16:25       ` Tim Deegan
2013-08-22 14:56 ` [PATCH 0/3] xen: arm: update asm primitives (bitops, spinlocks, atomics) Ian Campbell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).