xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] xen/arm64: Assembly optimized bitops from Linux
@ 2013-06-04 16:23 Ian Campbell
  2013-06-04 17:20 ` Tim Deegan
  0 siblings, 1 reply; 4+ messages in thread
From: Ian Campbell @ 2013-06-04 16:23 UTC (permalink / raw)
  To: xen-devel; +Cc: julien.grall, tim, Ian Campbell, stefano.stabellini

This patch replaces the previous hashed lock implementation of bitops with
assembly optimized ones taken from Linux v3.10-rc4.

The Linux derived ASM only supports 8 byte aligned bitmaps (which under Linux
are unsigned long * rather than our void *). We actually do have uses of 4
byte alignment (i.e. the bitmaps in struct xmem_pool) which trigger alignment
faults.

Therefore adjust the assembly to work in 4 byte increments, which involved:
    - bit offset now bits 4:0 => mask #31 not #63
    - word offset now bits 31:5 => shift #2 not #3
    - use wN register not xN for load/modify/store loop.

NB: Xen's build system cannot cope with the change from .c to .S file,
remove xen/arch/arm/arm64/lib/.bitops.o.d or clean your build tree.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 xen/arch/arm/arm64/lib/bitops.S    |   68 ++++++++++++
 xen/arch/arm/arm64/lib/bitops.c    |   22 ----
 xen/include/asm-arm/arm64/bitops.h |  203 ++----------------------------------
 3 files changed, 76 insertions(+), 217 deletions(-)
 create mode 100644 xen/arch/arm/arm64/lib/bitops.S
 delete mode 100644 xen/arch/arm/arm64/lib/bitops.c

diff --git a/xen/arch/arm/arm64/lib/bitops.S b/xen/arch/arm/arm64/lib/bitops.S
new file mode 100644
index 0000000..2535deb
--- /dev/null
+++ b/xen/arch/arm/arm64/lib/bitops.S
@@ -0,0 +1,68 @@
+/*
+ * Based on linux/arch/arm64/lib/bitops.h which in turn is
+ * Based on arch/arm/lib/bitops.h
+ *
+ * Copyright (C) 2013 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/config.h>
+
+/*
+ * x0: bits 4:0  bit offset
+ *     bits 31:5 word offset
+ * x1: address
+ */
+	.macro	bitop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #31		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #2	// Get word offset
+	lsl	x3, x2, x3		// Create mask
+1:	ldxr	w2, [x1]
+	\instr	w2, w2, w3
+	stxr	w0, w2, [x1]
+	cbnz	w0, 1b
+	ret
+ENDPROC(\name	)
+	.endm
+
+	.macro	testop, name, instr
+ENTRY(	\name	)
+	and	w3, w0, #31		// Get bit offset
+	eor	w0, w0, w3		// Clear low bits
+	mov	x2, #1
+	add	x1, x1, x0, lsr #2	// Get word offset
+	lsl	x4, x2, x3		// Create mask
+1:	ldaxr	w2, [x1]
+	lsr	w0, w2, w3		// Save old value of bit
+	\instr	w2, w2, w4		// toggle bit
+	stlxr	w5, w2, [x1]
+	cbnz	w5, 1b
+	and	w0, w0, #1
+3:	ret
+ENDPROC(\name	)
+	.endm
+
+/*
+ * Atomic bit operations.
+ */
+	bitop	change_bit, eor
+	bitop	clear_bit, bic
+	bitop	set_bit, orr
+
+	testop	test_and_change_bit, eor
+	testop	test_and_clear_bit, bic
+	testop	test_and_set_bit, orr
diff --git a/xen/arch/arm/arm64/lib/bitops.c b/xen/arch/arm/arm64/lib/bitops.c
deleted file mode 100644
index 02d8d78..0000000
--- a/xen/arch/arm/arm64/lib/bitops.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Limited
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <xen/spinlock.h>
-#include <xen/bitops.h>
-
-spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] /*__lock_aligned*/ = {
-       [0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED
-};
diff --git a/xen/include/asm-arm/arm64/bitops.h b/xen/include/asm-arm/arm64/bitops.h
index 0a6eba3..b43931d 100644
--- a/xen/include/asm-arm/arm64/bitops.h
+++ b/xen/include/asm-arm/arm64/bitops.h
@@ -1,200 +1,15 @@
 #ifndef _ARM_ARM64_BITOPS_H
 #define _ARM_ARM64_BITOPS_H
 
-/* Generic bitop support. Based on linux/include/asm-generic/bitops/atomic.h */
-
-#include <xen/spinlock.h>
-#include <xen/cache.h>          /* we use L1_CACHE_BYTES */
-
-/* Use an array of spinlocks for our atomic_ts.
- * Hash function to index into a different SPINLOCK.
- * Since "a" is usually an address, use one spinlock per cacheline.
- */
-#  define ATOMIC_HASH_SIZE 4
-#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
-
-extern spinlock_t __atomic_hash[ATOMIC_HASH_SIZE]/* __lock_aligned*/;
-
-#define _atomic_spin_lock_irqsave(l,f) do {     \
-       spinlock_t *s = ATOMIC_HASH(l);          \
-       spin_lock_irqsave(s, f);\
-} while(0)
-
-#define _atomic_spin_unlock_irqrestore(l,f) do {\
-        spinlock_t *s = ATOMIC_HASH(l);         \
-        spin_unlock_irqrestore(s,f);		\
-} while(0)
-
-#define FIXUP(_p, _mask)                        \
-    {                                           \
-        unsigned long __p = (unsigned long)_p;  \
-        if (__p & 0x7) {                        \
-            if (_mask > 0xffffffff) {           \
-             __p = (__p+32)&~0x7; _mask >>=32;  \
-            } else {                            \
-                __p &= ~0x7; _mask <<= 32;      \
-            }                                   \
-            if (0)printk("BITOPS: Fixup misaligned ptr %p => %#lx\n", _p, __p); \
-            _p = (void *)__p;                   \
-        }                                       \
-    }
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-
-static inline void set_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
-        //       nr, addr, mask, p, ATOMIC_HASH(p));
-        FIXUP(p, mask);
-        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
-        //       nr, addr, mask, p, ATOMIC_HASH(p));
-        //printk("before *p is %#lx\n", *p);
-	_atomic_spin_lock_irqsave(p, flags);
-	*p  |= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-        //printk(" after *p is %#lx\n", *p);
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	*p &= ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered. It may be
- * reordered on other architectures than x86.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	*p ^= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It may be reordered on other architectures than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old | mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It can be reorderdered on other architectures other than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old & ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
+/*
+ * Little endian assembly atomic bitops.
  */
-static inline int test_and_change_bit(int nr, volatile void *addr)
-{
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-        FIXUP(p, mask);
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old ^ mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
-
-	return (old & mask) != 0;
-}
+extern void set_bit(int nr, volatile void *p);
+extern void clear_bit(int nr, volatile void *p);
+extern void change_bit(int nr, volatile void *p);
+extern int test_and_set_bit(int nr, volatile void *p);
+extern int test_and_clear_bit(int nr, volatile void *p);
+extern int test_and_change_bit(int nr, volatile void *p);
 
 /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */
 /**
@@ -217,8 +32,6 @@ static /*__*/always_inline unsigned long __ffs(unsigned long word)
  */
 #define ffz(x)  __ffs(~(x))
 
-
-
 /* Based on linux/include/asm-generic/bitops/find.h */
 
 #ifndef find_next_bit
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] xen/arm64: Assembly optimized bitops from Linux
  2013-06-04 16:23 [PATCH] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
@ 2013-06-04 17:20 ` Tim Deegan
  2013-06-05  9:18   ` Ian Campbell
  0 siblings, 1 reply; 4+ messages in thread
From: Tim Deegan @ 2013-06-04 17:20 UTC (permalink / raw)
  To: Ian Campbell; +Cc: julien.grall, stefano.stabellini, xen-devel

Hi,

At 17:23 +0100 on 04 Jun (1370366607), Ian Campbell wrote:
> This patch replaces the previous hashed lock implementaiton of bitops with
> assembly optimized ones taken from Linux v3.10-rc4.

I understand that you took this code from linux, but:

> +/*
> + * x0: bits 4:0  bit offset
> + *     bits 31:5 word offset
> + * x1: address
> + */
> +	.macro	bitop, name, instr
> +ENTRY(	\name	)
> +	and	w3, w0, #31		// Get bit offset
> +	eor	w0, w0, w3		// Clear low bits

Not 'bic w0, w0, #31'?  The EOR has a dependence on the previous
instruction.  (Ditto for testop below).

Otherwise, Acked-by: Tim Deegan <tim@xen.org>

Cheers,

Tim.

> +	mov	x2, #1
> +	add	x1, x1, x0, lsr #2	// Get word offset
> +	lsl	x3, x2, x3		// Create mask
> +1:	ldxr	w2, [x1]
> +	\instr	w2, w2, w3
> +	stxr	w0, w2, [x1]
> +	cbnz	w0, 1b
> +	ret
> +ENDPROC(\name	)
> +	.endm
> +
> +	.macro	testop, name, instr
> +ENTRY(	\name	)
> +	and	w3, w0, #31		// Get bit offset
> +	eor	w0, w0, w3		// Clear low bits
> +	mov	x2, #1
> +	add	x1, x1, x0, lsr #2	// Get word offset
> +	lsl	x4, x2, x3		// Create mask
> +1:	ldaxr	w2, [x1]
> +	lsr	w0, w2, w3		// Save old value of bit
> +	\instr	w2, w2, w4		// toggle bit
> +	stlxr	w5, w2, [x1]
> +	cbnz	w5, 1b
> +	and	w0, w0, #1
> +3:	ret
> +ENDPROC(\name	)
> +	.endm
> +
> +/*
> + * Atomic bit operations.
> + */
> +	bitop	change_bit, eor
> +	bitop	clear_bit, bic
> +	bitop	set_bit, orr
> +
> +	testop	test_and_change_bit, eor
> +	testop	test_and_clear_bit, bic
> +	testop	test_and_set_bit, orr
> diff --git a/xen/arch/arm/arm64/lib/bitops.c b/xen/arch/arm/arm64/lib/bitops.c
> deleted file mode 100644
> index 02d8d78..0000000
> --- a/xen/arch/arm/arm64/lib/bitops.c
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/*
> - * Copyright (C) 2012 ARM Limited
> - *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> - *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> - * GNU General Public License for more details.
> - *
> - * You should have received a copy of the GNU General Public License
> - * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> - */
> -
> -#include <xen/spinlock.h>
> -#include <xen/bitops.h>
> -
> -spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] /*__lock_aligned*/ = {
> -       [0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED
> -};
> diff --git a/xen/include/asm-arm/arm64/bitops.h b/xen/include/asm-arm/arm64/bitops.h
> index 0a6eba3..b43931d 100644
> --- a/xen/include/asm-arm/arm64/bitops.h
> +++ b/xen/include/asm-arm/arm64/bitops.h
> @@ -1,200 +1,15 @@
>  #ifndef _ARM_ARM64_BITOPS_H
>  #define _ARM_ARM64_BITOPS_H
>  
> -/* Generic bitop support. Based on linux/include/asm-generic/bitops/atomic.h */
> -
> -#include <xen/spinlock.h>
> -#include <xen/cache.h>          /* we use L1_CACHE_BYTES */
> -
> -/* Use an array of spinlocks for our atomic_ts.
> - * Hash function to index into a different SPINLOCK.
> - * Since "a" is usually an address, use one spinlock per cacheline.
> - */
> -#  define ATOMIC_HASH_SIZE 4
> -#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
> -
> -extern spinlock_t __atomic_hash[ATOMIC_HASH_SIZE]/* __lock_aligned*/;
> -
> -#define _atomic_spin_lock_irqsave(l,f) do {     \
> -       spinlock_t *s = ATOMIC_HASH(l);          \
> -       spin_lock_irqsave(s, f);\
> -} while(0)
> -
> -#define _atomic_spin_unlock_irqrestore(l,f) do {\
> -        spinlock_t *s = ATOMIC_HASH(l);         \
> -        spin_unlock_irqrestore(s,f);		\
> -} while(0)
> -
> -#define FIXUP(_p, _mask)                        \
> -    {                                           \
> -        unsigned long __p = (unsigned long)_p;  \
> -        if (__p & 0x7) {                        \
> -            if (_mask > 0xffffffff) {           \
> -             __p = (__p+32)&~0x7; _mask >>=32;  \
> -            } else {                            \
> -                __p &= ~0x7; _mask <<= 32;      \
> -            }                                   \
> -            if (0)printk("BITOPS: Fixup misaligned ptr %p => %#lx\n", _p, __p); \
> -            _p = (void *)__p;                   \
> -        }                                       \
> -    }
> -
> -/**
> - * set_bit - Atomically set a bit in memory
> - * @nr: the bit to set
> - * @addr: the address to start counting from
> - *
> - * This function is atomic and may not be reordered.  See __set_bit()
> - * if you do not require the atomic guarantees.
> - *
> - * Note: there are no guarantees that this function will not be reordered
> - * on non x86 architectures, so if you are writing portable code,
> - * make sure not to rely on its reordering guarantees.
> - *
> - * Note that @nr may be almost arbitrarily large; this function is not
> - * restricted to acting on a single-word quantity.
> - */
> -
> -static inline void set_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long flags;
> -
> -        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
> -        //       nr, addr, mask, p, ATOMIC_HASH(p));
> -        FIXUP(p, mask);
> -        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
> -        //       nr, addr, mask, p, ATOMIC_HASH(p));
> -        //printk("before *p is %#lx\n", *p);
> -	_atomic_spin_lock_irqsave(p, flags);
> -	*p  |= mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -        //printk(" after *p is %#lx\n", *p);
> -}
> -
> -/**
> - * clear_bit - Clears a bit in memory
> - * @nr: Bit to clear
> - * @addr: Address to start counting from
> - *
> - * clear_bit() is atomic and may not be reordered.  However, it does
> - * not contain a memory barrier, so if it is used for locking purposes,
> - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
> - * in order to ensure changes are visible on other processors.
> - */
> -static inline void clear_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long flags;
> -
> -        FIXUP(p, mask);
> -
> -	_atomic_spin_lock_irqsave(p, flags);
> -	*p &= ~mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -}
> -
> -/**
> - * change_bit - Toggle a bit in memory
> - * @nr: Bit to change
> - * @addr: Address to start counting from
> - *
> - * change_bit() is atomic and may not be reordered. It may be
> - * reordered on other architectures than x86.
> - * Note that @nr may be almost arbitrarily large; this function is not
> - * restricted to acting on a single-word quantity.
> - */
> -static inline void change_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long flags;
> -
> -        FIXUP(p, mask);
> -
> -	_atomic_spin_lock_irqsave(p, flags);
> -	*p ^= mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -}
> -
> -/**
> - * test_and_set_bit - Set a bit and return its old value
> - * @nr: Bit to set
> - * @addr: Address to count from
> - *
> - * This operation is atomic and cannot be reordered.
> - * It may be reordered on other architectures than x86.
> - * It also implies a memory barrier.
> - */
> -static inline int test_and_set_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long old;
> -	unsigned long flags;
> -
> -        FIXUP(p, mask);
> -
> -	_atomic_spin_lock_irqsave(p, flags);
> -	old = *p;
> -	*p = old | mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -
> -	return (old & mask) != 0;
> -}
> -
> -/**
> - * test_and_clear_bit - Clear a bit and return its old value
> - * @nr: Bit to clear
> - * @addr: Address to count from
> - *
> - * This operation is atomic and cannot be reordered.
> - * It can be reorderdered on other architectures other than x86.
> - * It also implies a memory barrier.
> - */
> -static inline int test_and_clear_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long old;
> -	unsigned long flags;
> -
> -        FIXUP(p, mask);
> -
> -	_atomic_spin_lock_irqsave(p, flags);
> -	old = *p;
> -	*p = old & ~mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -
> -	return (old & mask) != 0;
> -}
> -
> -/**
> - * test_and_change_bit - Change a bit and return its old value
> - * @nr: Bit to change
> - * @addr: Address to count from
> - *
> - * This operation is atomic and cannot be reordered.
> - * It also implies a memory barrier.
> +/*
> + * Little endian assembly atomic bitops.
>   */
> -static inline int test_and_change_bit(int nr, volatile void *addr)
> -{
> -	unsigned long mask = BIT_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> -	unsigned long old;
> -	unsigned long flags;
> -
> -        FIXUP(p, mask);
> -
> -	_atomic_spin_lock_irqsave(p, flags);
> -	old = *p;
> -	*p = old ^ mask;
> -	_atomic_spin_unlock_irqrestore(p, flags);
> -
> -	return (old & mask) != 0;
> -}
> +extern void set_bit(int nr, volatile void *p);
> +extern void clear_bit(int nr, volatile void *p);
> +extern void change_bit(int nr, volatile void *p);
> +extern int test_and_set_bit(int nr, volatile void *p);
> +extern int test_and_clear_bit(int nr, volatile void *p);
> +extern int test_and_change_bit(int nr, volatile void *p);
>  
>  /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */
>  /**
> @@ -217,8 +32,6 @@ static /*__*/always_inline unsigned long __ffs(unsigned long word)
>   */
>  #define ffz(x)  __ffs(~(x))
>  
> -
> -
>  /* Based on linux/include/asm-generic/bitops/find.h */
>  
>  #ifndef find_next_bit
> -- 
> 1.7.2.5
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] xen/arm64: Assembly optimized bitops from Linux
  2013-06-04 17:20 ` Tim Deegan
@ 2013-06-05  9:18   ` Ian Campbell
  2013-06-05 10:16     ` Tim Deegan
  0 siblings, 1 reply; 4+ messages in thread
From: Ian Campbell @ 2013-06-05  9:18 UTC (permalink / raw)
  To: Tim Deegan; +Cc: julien.grall, stefano.stabellini, xen-devel

On Tue, 2013-06-04 at 18:20 +0100, Tim Deegan wrote:
> Hi,
> 
> At 17:23 +0100 on 04 Jun (1370366607), Ian Campbell wrote:
> > This patch replaces the previous hashed lock implementaiton of bitops with
> > assembly optimized ones taken from Linux v3.10-rc4.
> 
> I understand that you took this code from linux, but:
> 
> > +/*
> > + * x0: bits 4:0  bit offset
> > + *     bits 31:5 word offset
> > + * x1: address
> > + */
> > +	.macro	bitop, name, instr
> > +ENTRY(	\name	)
> > +	and	w3, w0, #31		// Get bit offset
> > +	eor	w0, w0, w3		// Clear low bits
> 
> Not 'bic w0, w0, #31'?  The EOR has a dependence on the previous
> instruction.  (Ditto for testop below).

Interesting question.

The 32-bit version uses a right shift, which it then undoes by turning
the right shift in the following add into the appropriate left shift.

I'll put this to the Linux ARM guys but would prefer to keep the code in
sync (as far as possible) for now.

Your questioning this did cause me to look again and I think I was wrong
to change:
	add	x1, x1, x0, lsr #3	// Get word offset
into
	add	x1, x1, x0, lsr #2	// Get word offset

The difference between 4-byte and 8-byte offsetting is already accounted
for in the masking. The #3 relates to the number of bits in a byte, not
the number of bytes in a word.

> Otherwise, Acked-by: Tim Deegan <tim@xen.org>
> 
> Cheers,
> 
> Tim.
> 
> > +	mov	x2, #1
> > +	add	x1, x1, x0, lsr #2	// Get word offset
> > +	lsl	x3, x2, x3		// Create mask
> > +1:	ldxr	w2, [x1]
> > +	\instr	w2, w2, w3
> > +	stxr	w0, w2, [x1]
> > +	cbnz	w0, 1b
> > +	ret
> > +ENDPROC(\name	)
> > +	.endm
> > +
> > +	.macro	testop, name, instr
> > +ENTRY(	\name	)
> > +	and	w3, w0, #31		// Get bit offset
> > +	eor	w0, w0, w3		// Clear low bits
> > +	mov	x2, #1
> > +	add	x1, x1, x0, lsr #2	// Get word offset
> > +	lsl	x4, x2, x3		// Create mask
> > +1:	ldaxr	w2, [x1]
> > +	lsr	w0, w2, w3		// Save old value of bit
> > +	\instr	w2, w2, w4		// toggle bit
> > +	stlxr	w5, w2, [x1]
> > +	cbnz	w5, 1b
> > +	and	w0, w0, #1
> > +3:	ret
> > +ENDPROC(\name	)
> > +	.endm
> > +
> > +/*
> > + * Atomic bit operations.
> > + */
> > +	bitop	change_bit, eor
> > +	bitop	clear_bit, bic
> > +	bitop	set_bit, orr
> > +
> > +	testop	test_and_change_bit, eor
> > +	testop	test_and_clear_bit, bic
> > +	testop	test_and_set_bit, orr
> > diff --git a/xen/arch/arm/arm64/lib/bitops.c b/xen/arch/arm/arm64/lib/bitops.c
> > deleted file mode 100644
> > index 02d8d78..0000000
> > --- a/xen/arch/arm/arm64/lib/bitops.c
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/*
> > - * Copyright (C) 2012 ARM Limited
> > - *
> > - * This program is free software; you can redistribute it and/or modify
> > - * it under the terms of the GNU General Public License version 2 as
> > - * published by the Free Software Foundation.
> > - *
> > - * This program is distributed in the hope that it will be useful,
> > - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > - * GNU General Public License for more details.
> > - *
> > - * You should have received a copy of the GNU General Public License
> > - * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> > - */
> > -
> > -#include <xen/spinlock.h>
> > -#include <xen/bitops.h>
> > -
> > -spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] /*__lock_aligned*/ = {
> > -       [0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED
> > -};
> > diff --git a/xen/include/asm-arm/arm64/bitops.h b/xen/include/asm-arm/arm64/bitops.h
> > index 0a6eba3..b43931d 100644
> > --- a/xen/include/asm-arm/arm64/bitops.h
> > +++ b/xen/include/asm-arm/arm64/bitops.h
> > @@ -1,200 +1,15 @@
> >  #ifndef _ARM_ARM64_BITOPS_H
> >  #define _ARM_ARM64_BITOPS_H
> >  
> > -/* Generic bitop support. Based on linux/include/asm-generic/bitops/atomic.h */
> > -
> > -#include <xen/spinlock.h>
> > -#include <xen/cache.h>          /* we use L1_CACHE_BYTES */
> > -
> > -/* Use an array of spinlocks for our atomic_ts.
> > - * Hash function to index into a different SPINLOCK.
> > - * Since "a" is usually an address, use one spinlock per cacheline.
> > - */
> > -#  define ATOMIC_HASH_SIZE 4
> > -#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
> > -
> > -extern spinlock_t __atomic_hash[ATOMIC_HASH_SIZE]/* __lock_aligned*/;
> > -
> > -#define _atomic_spin_lock_irqsave(l,f) do {     \
> > -       spinlock_t *s = ATOMIC_HASH(l);          \
> > -       spin_lock_irqsave(s, f);\
> > -} while(0)
> > -
> > -#define _atomic_spin_unlock_irqrestore(l,f) do {\
> > -        spinlock_t *s = ATOMIC_HASH(l);         \
> > -        spin_unlock_irqrestore(s,f);		\
> > -} while(0)
> > -
> > -#define FIXUP(_p, _mask)                        \
> > -    {                                           \
> > -        unsigned long __p = (unsigned long)_p;  \
> > -        if (__p & 0x7) {                        \
> > -            if (_mask > 0xffffffff) {           \
> > -             __p = (__p+32)&~0x7; _mask >>=32;  \
> > -            } else {                            \
> > -                __p &= ~0x7; _mask <<= 32;      \
> > -            }                                   \
> > -            if (0)printk("BITOPS: Fixup misaligned ptr %p => %#lx\n", _p, __p); \
> > -            _p = (void *)__p;                   \
> > -        }                                       \
> > -    }
> > -
> > -/**
> > - * set_bit - Atomically set a bit in memory
> > - * @nr: the bit to set
> > - * @addr: the address to start counting from
> > - *
> > - * This function is atomic and may not be reordered.  See __set_bit()
> > - * if you do not require the atomic guarantees.
> > - *
> > - * Note: there are no guarantees that this function will not be reordered
> > - * on non x86 architectures, so if you are writing portable code,
> > - * make sure not to rely on its reordering guarantees.
> > - *
> > - * Note that @nr may be almost arbitrarily large; this function is not
> > - * restricted to acting on a single-word quantity.
> > - */
> > -
> > -static inline void set_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long flags;
> > -
> > -        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
> > -        //       nr, addr, mask, p, ATOMIC_HASH(p));
> > -        FIXUP(p, mask);
> > -        //printk("set_bit: nr %d addr %p mask %#lx p %p lock %p\n",
> > -        //       nr, addr, mask, p, ATOMIC_HASH(p));
> > -        //printk("before *p is %#lx\n", *p);
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	*p  |= mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -        //printk(" after *p is %#lx\n", *p);
> > -}
> > -
> > -/**
> > - * clear_bit - Clears a bit in memory
> > - * @nr: Bit to clear
> > - * @addr: Address to start counting from
> > - *
> > - * clear_bit() is atomic and may not be reordered.  However, it does
> > - * not contain a memory barrier, so if it is used for locking purposes,
> > - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
> > - * in order to ensure changes are visible on other processors.
> > - */
> > -static inline void clear_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long flags;
> > -
> > -        FIXUP(p, mask);
> > -
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	*p &= ~mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -}
> > -
> > -/**
> > - * change_bit - Toggle a bit in memory
> > - * @nr: Bit to change
> > - * @addr: Address to start counting from
> > - *
> > - * change_bit() is atomic and may not be reordered. It may be
> > - * reordered on other architectures than x86.
> > - * Note that @nr may be almost arbitrarily large; this function is not
> > - * restricted to acting on a single-word quantity.
> > - */
> > -static inline void change_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long flags;
> > -
> > -        FIXUP(p, mask);
> > -
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	*p ^= mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -}
> > -
> > -/**
> > - * test_and_set_bit - Set a bit and return its old value
> > - * @nr: Bit to set
> > - * @addr: Address to count from
> > - *
> > - * This operation is atomic and cannot be reordered.
> > - * It may be reordered on other architectures than x86.
> > - * It also implies a memory barrier.
> > - */
> > -static inline int test_and_set_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long old;
> > -	unsigned long flags;
> > -
> > -        FIXUP(p, mask);
> > -
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	old = *p;
> > -	*p = old | mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -
> > -	return (old & mask) != 0;
> > -}
> > -
> > -/**
> > - * test_and_clear_bit - Clear a bit and return its old value
> > - * @nr: Bit to clear
> > - * @addr: Address to count from
> > - *
> > - * This operation is atomic and cannot be reordered.
> > - * It can be reorderdered on other architectures other than x86.
> > - * It also implies a memory barrier.
> > - */
> > -static inline int test_and_clear_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long old;
> > -	unsigned long flags;
> > -
> > -        FIXUP(p, mask);
> > -
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	old = *p;
> > -	*p = old & ~mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -
> > -	return (old & mask) != 0;
> > -}
> > -
> > -/**
> > - * test_and_change_bit - Change a bit and return its old value
> > - * @nr: Bit to change
> > - * @addr: Address to count from
> > - *
> > - * This operation is atomic and cannot be reordered.
> > - * It also implies a memory barrier.
> > +/*
> > + * Little endian assembly atomic bitops.
> >   */
> > -static inline int test_and_change_bit(int nr, volatile void *addr)
> > -{
> > -	unsigned long mask = BIT_MASK(nr);
> > -	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
> > -	unsigned long old;
> > -	unsigned long flags;
> > -
> > -        FIXUP(p, mask);
> > -
> > -	_atomic_spin_lock_irqsave(p, flags);
> > -	old = *p;
> > -	*p = old ^ mask;
> > -	_atomic_spin_unlock_irqrestore(p, flags);
> > -
> > -	return (old & mask) != 0;
> > -}
> > +extern void set_bit(int nr, volatile void *p);
> > +extern void clear_bit(int nr, volatile void *p);
> > +extern void change_bit(int nr, volatile void *p);
> > +extern int test_and_set_bit(int nr, volatile void *p);
> > +extern int test_and_clear_bit(int nr, volatile void *p);
> > +extern int test_and_change_bit(int nr, volatile void *p);
> >  
> >  /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */
> >  /**
> > @@ -217,8 +32,6 @@ static /*__*/always_inline unsigned long __ffs(unsigned long word)
> >   */
> >  #define ffz(x)  __ffs(~(x))
> >  
> > -
> > -
> >  /* Based on linux/include/asm-generic/bitops/find.h */
> >  
> >  #ifndef find_next_bit
> > -- 
> > 1.7.2.5
> > 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] xen/arm64: Assembly optimized bitops from Linux
  2013-06-05  9:18   ` Ian Campbell
@ 2013-06-05 10:16     ` Tim Deegan
  0 siblings, 0 replies; 4+ messages in thread
From: Tim Deegan @ 2013-06-05 10:16 UTC (permalink / raw)
  To: Ian Campbell; +Cc: julien.grall, xen-devel, stefano.stabellini

Hi,

At 10:18 +0100 on 05 Jun (1370427505), Ian Campbell wrote:
> On Tue, 2013-06-04 at 18:20 +0100, Tim Deegan wrote:
> > At 17:23 +0100 on 04 Jun (1370366607), Ian Campbell wrote:
> > > +	and	w3, w0, #31		// Get bit offset
> > > +	eor	w0, w0, w3		// Clear low bits
> > 
> > Not 'bic w0, w0, #31'?  The EOR has a dependence on the previous
> > instruction.  (Ditto for testop below).
> 
> Interesting question.
> 
> The 32-bit version uses a right shift, which it then undoes by turning
> the right shift in the following add into the appropriate left shift.

That seems about equivalent to me (in as much as I have no idea of the
relative costs of lsr and bic).

> I'll put this to the Linux ARM guys but would prefer to keep the code in
> sync (as far as possible) for now.

Fair enough.

> Your questioning this did cause me to look again and I think I was wrong
> to change:
> 	add	x1, x1, x0, lsr #3	// Get word offset
> into
> 	add	x1, x1, x0, lsr #2	// Get word offset
> 
> The difference between 4-byte and 8-byte offsetting is already accounted
> for in the masking. The #3 relates to the number of bits in a byte, not
> the number of bytes in a word.

Erk.  Yes, that seems right. :)

Tim.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-06-05 10:16 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-04 16:23 [PATCH] xen/arm64: Assembly optimized bitops from Linux Ian Campbell
2013-06-04 17:20 ` Tim Deegan
2013-06-05  9:18   ` Ian Campbell
2013-06-05 10:16     ` Tim Deegan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).