* [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
@ 2024-08-19 18:30 Christoph Lameter via B4 Relay
2024-08-23 10:32 ` Will Deacon
2024-08-23 21:05 ` Thomas Gleixner
0 siblings, 2 replies; 7+ messages in thread
From: Christoph Lameter via B4 Relay @ 2024-08-19 18:30 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ingo Molnar,
Waiman Long, Boqun Feng
Cc: Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel,
linux-arch, Christoph Lameter (Ampere)
From: "Christoph Lameter (Ampere)" <cl@gentwo.org>
Some architectures support load acquire which can save us a memory
barrier and save some cycles.
A typical sequence
do {
seq = read_seqcount_begin(&s);
<something>
} while (read_seqcount_retry(&s, seq));
requires 13 cycles on ARM64 for an empty loop. Two read memory
barriers are needed. One for each of the seqcount_* functions.
We can replace the first read barrier with a load acquire of
the seqcount which saves us one barrier.
On ARM64 doing so reduces the cycle count from 13 to 8.
This is a general improvement for the ARM64 architecture and not
specific to a certain processor. The cycle count here was
obtained on a Neoverse N1 (Ampere Altra).
The ARM documentation states that load acquire is more effective
than a load plus barrier. In general that tends to be true on all
compute platforms that support both.
See (as quoted by Linus Torvalds):
https://developer.arm.com/documentation/102336/0100/Load-Acquire-and-Store-Release-instructions
"Weaker ordering requirements that are imposed by Load-Acquire and
Store-Release instructions allow for micro-architectural
optimizations, which could reduce some of the performance impacts that
are otherwise imposed by an explicit memory barrier.
If the ordering requirement is satisfied using either a Load-Acquire
or Store-Release, then it would be preferable to use these
instructions instead of a DMB"
Signed-off-by: Christoph Lameter (Ampere) <cl@gentwo.org>
---
V1->V2
- Describe the benefit of load acquire vs barriers
- Explain the CONFIG_ARCH_HAS_ACQUIRE_RELEASE option better
---
arch/Kconfig | 8 ++++++++
arch/arm64/Kconfig | 1 +
include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 50 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..3c270f496231 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1600,6 +1600,14 @@ config ARCH_HAS_KERNEL_FPU_SUPPORT
Architectures that select this option can run floating-point code in
the kernel, as described in Documentation/core-api/floating-point.rst.
+config ARCH_HAS_ACQUIRE_RELEASE
+ bool
+ help
+ Setting ARCH_HAS_ACQUIRE_RELEASE indicates that the architecture
+ supports load acquire and release. Typically these are more effective
+ than memory barriers. Code will prefer the use of load acquire and
+ store release over memory barriers if this option is enabled.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca6..19e34fff145f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -39,6 +39,7 @@ config ARM64
select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
+ select ARCH_HAS_ACQUIRE_RELEASE
select ARCH_HAS_SETUP_DMA_OPS
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..353fcf32b800 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
return seq; \
} \
\
+static __always_inline unsigned \
+__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
+{ \
+ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
+ \
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
+ return seq; \
+ \
+ if (preemptible && unlikely(seq & 1)) { \
+ __SEQ_LOCK(lockbase##_lock(s->lock)); \
+ __SEQ_LOCK(lockbase##_unlock(s->lock)); \
+ \
+ /* \
+ * Re-read the sequence counter since the (possibly \
+ * preempted) writer made progress. \
+ */ \
+ seq = smp_load_acquire(&s->seqcount.sequence); \
+ } \
+ \
+ return seq; \
+} \
+ \
static __always_inline bool \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
{ \
@@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
return READ_ONCE(s->sequence);
}
+static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
+{
+ return smp_load_acquire(&s->sequence);
+}
+
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
return false;
@@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
#define seqprop_ptr(s) __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s) __seqprop(s, sequence)(s)
+#define seqprop_sequence_acquire(s) __seqprop(s, sequence_acquire)(s)
#define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
#define seqprop_assert(s) __seqprop(s, assert)(s)
@@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
*
* Return: count to be passed to read_seqcount_retry()
*/
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+#define raw_read_seqcount_begin(s) \
+({ \
+ unsigned _seq; \
+ \
+ while ((_seq = seqprop_sequence_acquire(s)) & 1) \
+ cpu_relax(); \
+ \
+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
+ _seq; \
+})
+#else
#define raw_read_seqcount_begin(s) \
({ \
unsigned _seq = __read_seqcount_begin(s); \
@@ -300,6 +340,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
smp_rmb(); \
_seq; \
})
+#endif
/**
* read_seqcount_begin() - begin a seqcount_t read critical section
---
base-commit: b0da640826ba3b6506b4996a6b23a429235e6923
change-id: 20240813-seq_optimize-68c48696c798
Best regards,
--
Christoph Lameter <cl@gentwo.org>
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-19 18:30 [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire Christoph Lameter via B4 Relay @ 2024-08-23 10:32 ` Will Deacon 2024-08-23 17:56 ` Christoph Lameter (Ampere) 2024-08-23 19:38 ` Christoph Lameter (Ampere) 2024-08-23 21:05 ` Thomas Gleixner 1 sibling, 2 replies; 7+ messages in thread From: Will Deacon @ 2024-08-23 10:32 UTC (permalink / raw) To: cl Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng, Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote: > diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h > index d90d8ee29d81..353fcf32b800 100644 > --- a/include/linux/seqlock.h > +++ b/include/linux/seqlock.h > @@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ > return seq; \ > } \ > \ > +static __always_inline unsigned \ > +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \ > +{ \ > + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ > + \ > + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ > + return seq; \ > + \ > + if (preemptible && unlikely(seq & 1)) { \ > + __SEQ_LOCK(lockbase##_lock(s->lock)); \ > + __SEQ_LOCK(lockbase##_unlock(s->lock)); \ > + \ > + /* \ > + * Re-read the sequence counter since the (possibly \ > + * preempted) writer made progress. \ > + */ \ > + seq = smp_load_acquire(&s->seqcount.sequence); \ We could probably do even better with LDAPR here, as that should be sufficient for this. It's a can of worms though, as it's not implemented on all CPUs and relaxing smp_load_acquire() might introduce subtle breakage in places where it's used to build other types of lock. Maybe you can hack something to see if there's any performance left behind without it? 
> + } \ > + \ > + return seq; \ > +} \ > + \ > static __always_inline bool \ > __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ > { \ > @@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s) > return READ_ONCE(s->sequence); > } > > +static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s) > +{ > + return smp_load_acquire(&s->sequence); > +} > + > static inline bool __seqprop_preemptible(const seqcount_t *s) > { > return false; > @@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) > #define seqprop_ptr(s) __seqprop(s, ptr)(s) > #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) > #define seqprop_sequence(s) __seqprop(s, sequence)(s) > +#define seqprop_sequence_acquire(s) __seqprop(s, sequence_acquire)(s) > #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) > #define seqprop_assert(s) __seqprop(s, assert)(s) > > @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) > * > * Return: count to be passed to read_seqcount_retry() > */ > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE > +#define raw_read_seqcount_begin(s) \ > +({ \ > + unsigned _seq; \ > + \ > + while ((_seq = seqprop_sequence_acquire(s)) & 1) \ > + cpu_relax(); \ It would also be interesting to see whether smp_cond_load_acquire() performs any better that this loop in the !RT case. Both things to look at separately though, so: Acked-by: Will Deacon <will@kernel.org> I assume this will go via -tip. Will ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-23 10:32 ` Will Deacon @ 2024-08-23 17:56 ` Christoph Lameter (Ampere) 2024-08-23 19:38 ` Christoph Lameter (Ampere) 1 sibling, 0 replies; 7+ messages in thread From: Christoph Lameter (Ampere) @ 2024-08-23 17:56 UTC (permalink / raw) To: Will Deacon Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng, Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch On Fri, 23 Aug 2024, Will Deacon wrote: > On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote: > > +static __always_inline unsigned \ > > +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \ > > +{ \ > > + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ > > + \ > > + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ > > + return seq; \ > > + \ > > + if (preemptible && unlikely(seq & 1)) { \ > > + __SEQ_LOCK(lockbase##_lock(s->lock)); \ > > + __SEQ_LOCK(lockbase##_unlock(s->lock)); \ > > + \ > > + /* \ > > + * Re-read the sequence counter since the (possibly \ > > + * preempted) writer made progress. \ > > + */ \ > > + seq = smp_load_acquire(&s->seqcount.sequence); \ > > We could probably do even better with LDAPR here, as that should be > sufficient for this. It's a can of worms though, as it's not implemented > on all CPUs and relaxing smp_load_acquire() might introduce subtle > breakage in places where it's used to build other types of lock. Maybe > you can hack something to see if there's any performance left behind > without it? I added the following patch. Kernel booted fine. 
No change in the cycles of read_seq() LDAPR --------------------------- Test Single 2 CPU 4 CPU 8 CPU 16 CPU 32 CPU 64 CPU ALL write seq : 13 98 385 764 1551 3043 6259 11922 read seq : 8 8 8 8 8 8 9 10 rw seq : 8 101 247 300 467 742 1384 2101 LDA --------------------------- Test Single 2 CPU 4 CPU 8 CPU 16 CPU 32 CPU 64 CPU ALL write seq : 13 90 343 785 1533 3032 6315 11073 read seq : 8 8 8 8 8 8 9 11 rw seq : 8 79 227 313 423 755 1313 2220 Index: linux/arch/arm64/include/asm/barrier.h =================================================================== --- linux.orig/arch/arm64/include/asm/barrier.h +++ linux/arch/arm64/include/asm/barrier.h @@ -167,22 +167,22 @@ do { \ kasan_check_read(__p, sizeof(*p)); \ switch (sizeof(*p)) { \ case 1: \ - asm volatile ("ldarb %w0, %1" \ + asm volatile (".arch_extension rcpc\nldaprb %w0, %1" \ : "=r" (*(__u8 *)__u.__c) \ : "Q" (*__p) : "memory"); \ break; \ case 2: \ - asm volatile ("ldarh %w0, %1" \ + asm volatile (".arch_extension rcpc\nldaprh %w0, %1" \ : "=r" (*(__u16 *)__u.__c) \ : "Q" (*__p) : "memory"); \ break; \ case 4: \ - asm volatile ("ldar %w0, %1" \ + asm volatile (".arch_extension rcpc\nldapr %w0, %1" \ : "=r" (*(__u32 *)__u.__c) \ : "Q" (*__p) : "memory"); \ break; \ case 8: \ - asm volatile ("ldar %0, %1" \ + asm volatile (".arch_extension rcpc\nldapr %0, %1" \ : "=r" (*(__u64 *)__u.__c) \ : "Q" (*__p) : "memory"); \ break; \ ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-23 10:32 ` Will Deacon 2024-08-23 17:56 ` Christoph Lameter (Ampere) @ 2024-08-23 19:38 ` Christoph Lameter (Ampere) 1 sibling, 0 replies; 7+ messages in thread From: Christoph Lameter (Ampere) @ 2024-08-23 19:38 UTC (permalink / raw) To: Will Deacon Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng, Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch On Fri, 23 Aug 2024, Will Deacon wrote: > > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE > > +#define raw_read_seqcount_begin(s) \ > > +({ \ > > + unsigned _seq; \ > > + \ > > + while ((_seq = seqprop_sequence_acquire(s)) & 1) \ > > + cpu_relax(); \ > > It would also be interesting to see whether smp_cond_load_acquire() > performs any better that this loop in the !RT case. The hack to do this follows. Kernel boots but no change in cycles. Also builds a kernel just fine. Another benchmark may be better. All my synthetic tests do is run the function calls in a loop in parallel on multiple cpus. The main effect here may be the reduction of power since the busyloop is no longer required. I would favor a solution like this. But the patch is not clean given the need to get rid of the const attribute with a cast. Index: linux/include/linux/seqlock.h =================================================================== --- linux.orig/include/linux/seqlock.h +++ linux/include/linux/seqlock.h @@ -325,9 +325,9 @@ SEQCOUNT_LOCKNAME(mutex, struct m #define raw_read_seqcount_begin(s) \ ({ \ unsigned _seq; \ + seqcount_t *e = seqprop_ptr((struct seqcount_spinlock *)s); \ \ - while ((_seq = seqprop_sequence_acquire(s)) & 1) \ - cpu_relax(); \ + _seq = smp_cond_load_acquire(&e->sequence, ((e->sequence & 1) == 0)); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ _seq; \ ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-19 18:30 [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire Christoph Lameter via B4 Relay 2024-08-23 10:32 ` Will Deacon @ 2024-08-23 21:05 ` Thomas Gleixner 2024-08-28 17:15 ` Christoph Lameter (Ampere) 1 sibling, 1 reply; 7+ messages in thread From: Thomas Gleixner @ 2024-08-23 21:05 UTC (permalink / raw) To: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng Cc: Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch, Christoph Lameter (Ampere) On Mon, Aug 19 2024 at 11:30, Christoph Lameter via wrote: > @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) > * > * Return: count to be passed to read_seqcount_retry() > */ > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE > +#define raw_read_seqcount_begin(s) \ > +({ \ > + unsigned _seq; \ > + \ > + while ((_seq = seqprop_sequence_acquire(s)) & 1) \ > + cpu_relax(); \ > + \ > + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ > + _seq; \ > +}) So this covers only raw_read_seqcount_begin(), but not raw_read_seqcount() which has the same smp_rmb() inside. This all can be done without the extra copies of the counter accessors. Uncompiled patch below. It's a little larger than I initialy wanted to do it, but I had to keep the raw READ_ONCE() for __read_seqcount_begin() to not inflict the smp_load_acquire() to the only usage site in the dcache code. The acquire conditional in __seqprop_load_sequence() is optimized out by the compiler as all of this is macro/__always_inline. 
Thanks, tglx --- --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -132,6 +132,14 @@ static inline void seqcount_lockdep_read #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) +static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire) +{ + if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) + return smp_load_acquire(&s->sequence); + else + return READ_ONCE(s->sequence); +} + /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t @@ -155,9 +163,10 @@ static __always_inline const seqcount_t } \ \ static __always_inline unsigned \ -__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s, \ + bool acquire) \ { \ - unsigned seq = READ_ONCE(s->seqcount.sequence); \ + unsigned seq = __seqprop_load_sequence(&s->seqcount, acquire); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ @@ -170,7 +179,7 @@ static __always_inline unsigned \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. 
\ */ \ - seq = READ_ONCE(s->seqcount.sequence); \ + seq = __seqprop_load_sequence(&s->seqcount, acquire); \ } \ \ return seq; \ @@ -206,9 +215,9 @@ static inline const seqcount_t *__seqpro return s; } -static inline unsigned __seqprop_sequence(const seqcount_t *s) +static inline unsigned __seqprop_sequence(const seqcount_t *s, bool acquire) { - return READ_ONCE(s->sequence); + return __seqprop_load_sequence(s, acquire); } static inline bool __seqprop_preemptible(const seqcount_t *s) @@ -258,29 +267,23 @@ SEQCOUNT_LOCKNAME(mutex, struct m #define seqprop_ptr(s) __seqprop(s, ptr)(s) #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) -#define seqprop_sequence(s) __seqprop(s, sequence)(s) +#define seqprop_sequence(s, a) __seqprop(s, sequence)(s, a) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) #define seqprop_assert(s) __seqprop(s, assert)(s) /** - * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier - * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() - * barrier. Callers should ensure that smp_rmb() or equivalent ordering is - * provided before actually loading any of the variables that are to be - * protected in this critical section. - * - * Use carefully, only in critical code, and comment how the barrier is - * provided. + * read_seqcount_begin_cond_acquire() - begin a seqcount_t read section + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * @acquire: If true, the read of the sequence count uses smp_load_acquire() + * if the architecure provides and enabled it. 
* * Return: count to be passed to read_seqcount_retry() */ -#define __read_seqcount_begin(s) \ +#define read_seqcount_begin_cond_acquire(s, acquire) \ ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while ((__seq = seqprop_sequence(s, acquire)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ @@ -288,6 +291,26 @@ SEQCOUNT_LOCKNAME(mutex, struct m }) /** + * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * __read_seqcount_begin is like read_seqcount_begin, but it neither + * provides a smp_rmb() barrier nor does it use smp_load_acquire() on + * architectures which provide it. + * + * Callers should ensure that smp_rmb() or equivalent ordering is provided + * before actually loading any of the variables that are to be protected in + * this critical section. + * + * Use carefully, only in critical code, and comment how the barrier is + * provided. + * + * Return: count to be passed to read_seqcount_retry() + */ +#define __read_seqcount_begin(s) \ + read_seqcount_begin_cond_acquire(s, false) + +/** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @@ -295,9 +318,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m */ #define raw_read_seqcount_begin(s) \ ({ \ - unsigned _seq = __read_seqcount_begin(s); \ + unsigned _seq = read_seqcount_begin_cond_acquire(s, true); \ \ - smp_rmb(); \ + if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \ + smp_rmb(); \ _seq; \ }) @@ -326,9 +350,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m */ #define raw_read_seqcount(s) \ ({ \ - unsigned __seq = seqprop_sequence(s); \ + unsigned __seq = seqprop_sequence(s, true); \ \ - smp_rmb(); \ + if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \ + smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-23 21:05 ` Thomas Gleixner @ 2024-08-28 17:15 ` Christoph Lameter (Ampere) 2024-09-02 11:55 ` Thomas Gleixner 0 siblings, 1 reply; 7+ messages in thread From: Christoph Lameter (Ampere) @ 2024-08-28 17:15 UTC (permalink / raw) To: Thomas Gleixner Cc: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng, Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch On Fri, 23 Aug 2024, Thomas Gleixner wrote: > This all can be done without the extra copies of the counter > accessors. Uncompiled patch below. Great. Thanks. Tried it too initially but could not make it work right. One thing that we also want is the use of the smp_cond_load_acquire to have the cpu power down while waiting for a cacheline change. The code has several places where loops occur when the last bit is set in the seqcount. We could use smp_cond_load_acquire in load_sequence() but what do we do about the loops at the higher level? Also this does not sync with the lock checking logic. diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 68b3af8bd6c6..4442a97ffe9a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -135,7 +135,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire) { if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) - return smp_load_acquire(&s->sequence); + return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0); else return READ_ONCE(s->sequence); } ^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire 2024-08-28 17:15 ` Christoph Lameter (Ampere) @ 2024-09-02 11:55 ` Thomas Gleixner 0 siblings, 0 replies; 7+ messages in thread From: Thomas Gleixner @ 2024-09-02 11:55 UTC (permalink / raw) To: Christoph Lameter (Ampere) Cc: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon, Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng, Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel, linux-arch On Wed, Aug 28 2024 at 10:15, Christoph Lameter wrote: > On Fri, 23 Aug 2024, Thomas Gleixner wrote: > >> This all can be done without the extra copies of the counter >> accessors. Uncompiled patch below. > > Great. Thanks. Tried it too initially but could not make it work right. > > One thing that we also want is the use of the smp_cond_load_acquire to > have the cpu power down while waiting for a cacheline change. > > The code has several places where loops occur when the last bit is set in > the seqcount. > > We could use smp_cond_load_acquire in load_sequence() but what do we do > about the loops at the higher level? Also this does not sync with the lock > checking logic. Come on. It's not rocket science to figure that out. Uncompiled delta patch below. Thanks, tglx --- --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -23,6 +23,13 @@ #include <asm/processor.h> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE +# define USE_LOAD_ACQUIRE true +# define USE_COND_LOAD_ACQUIRE !IS_ENABLED(CONFIG_PREEMPT_RT) +#else +# define USE_LOAD_ACQUIRE false +# define USE_COND_LOAD_ACQUIRE false +#endif /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. 
For readers, typically there is a call to @@ -134,10 +141,13 @@ static inline void seqcount_lockdep_read static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire) { - if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) - return smp_load_acquire(&s->sequence); - else + if (!acquire || !USE_LOAD_ACQUIRE) return READ_ONCE(s->sequence); + + if (USE_COND_LOAD_ACQUIRE) + return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0); + + return smp_load_acquire(&s->sequence); } /* @@ -283,8 +293,12 @@ SEQCOUNT_LOCKNAME(mutex, struct m ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s, acquire)) & 1) \ - cpu_relax(); \ + if (acquire && USE_COND_LOAD_ACQUIRE) { \ + __seq = seqprop_sequence(s, acquire); \ + } else { \ + while ((__seq = seqprop_sequence(s, acquire)) & 1) \ + cpu_relax(); \ + } \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ ^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-09-02 11:55 UTC | newest] Thread overview: 7+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2024-08-19 18:30 [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire Christoph Lameter via B4 Relay 2024-08-23 10:32 ` Will Deacon 2024-08-23 17:56 ` Christoph Lameter (Ampere) 2024-08-23 19:38 ` Christoph Lameter (Ampere) 2024-08-23 21:05 ` Thomas Gleixner 2024-08-28 17:15 ` Christoph Lameter (Ampere) 2024-09-02 11:55 ` Thomas Gleixner
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).