* [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
@ 2024-08-19 18:30 Christoph Lameter via B4 Relay
2024-08-23 10:32 ` Will Deacon
2024-08-23 21:05 ` Thomas Gleixner
0 siblings, 2 replies; 7+ messages in thread
From: Christoph Lameter via B4 Relay @ 2024-08-19 18:30 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ingo Molnar,
Waiman Long, Boqun Feng
Cc: Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel,
linux-arch, Christoph Lameter (Ampere)
From: "Christoph Lameter (Ampere)" <cl@gentwo.org>
Some architectures support load acquire, which can save us a memory
barrier and some cycles.
A typical sequence
	do {
		seq = read_seqcount_begin(&s);
		<something>
	} while (read_seqcount_retry(&s, seq));
requires 13 cycles on ARM64 for an empty loop. Two read memory
barriers are needed, one for each of the seqcount_* functions.
We can replace the first read barrier with a load acquire of
the seqcount which saves us one barrier.
On ARM64 doing so reduces the cycle count from 13 to 8.
This is a general improvement for the ARM64 architecture and not
specific to a certain processor. The cycle count here was
obtained on a Neoverse N1 (Ampere Altra).
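Conceptually, the reader-side begin path changes as in the sketch below
(simplified to the plain seqcount_t, non-PREEMPT_RT case of the patch; the
arm64 instructions noted in the comments are the usual mappings):
	/* Before: plain load in the retry loop, then a read barrier */
	while ((seq = READ_ONCE(s->sequence)) & 1)		/* arm64: ldr */
		cpu_relax();
	smp_rmb();						/* arm64: dmb ishld */
	/* After: the acquire load provides the read ordering itself */
	while ((seq = smp_load_acquire(&s->sequence)) & 1)	/* arm64: ldar */
		cpu_relax();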
The ARM documentation states that load acquire is more effective
than a load plus barrier. In general that tends to be true on all
compute platforms that support both.
See (as quoted by Linus Torvalds):
https://developer.arm.com/documentation/102336/0100/Load-Acquire-and-Store-Release-instructions
"Weaker ordering requirements that are imposed by Load-Acquire and
Store-Release instructions allow for micro-architectural
optimizations, which could reduce some of the performance impacts that
are otherwise imposed by an explicit memory barrier.
If the ordering requirement is satisfied using either a Load-Acquire
or Store-Release, then it would be preferable to use these
instructions instead of a DMB"
Signed-off-by: Christoph Lameter (Ampere) <cl@gentwo.org>
---
V1->V2
- Describe the benefit of load acquire vs barriers
- Explain the CONFIG_ARCH_HAS_ACQUIRE_RELEASE option better
---
arch/Kconfig | 8 ++++++++
arch/arm64/Kconfig | 1 +
include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 50 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..3c270f496231 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1600,6 +1600,14 @@ config ARCH_HAS_KERNEL_FPU_SUPPORT
Architectures that select this option can run floating-point code in
the kernel, as described in Documentation/core-api/floating-point.rst.
+config ARCH_HAS_ACQUIRE_RELEASE
+ bool
+ help
+ Setting ARCH_HAS_ACQUIRE_RELEASE indicates that the architecture
+ supports load acquire and release. Typically these are more effective
+ than memory barriers. Code will prefer the use of load acquire and
+ store release over memory barriers if this option is enabled.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca6..19e34fff145f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -39,6 +39,7 @@ config ARM64
select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
+ select ARCH_HAS_ACQUIRE_RELEASE
select ARCH_HAS_SETUP_DMA_OPS
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..353fcf32b800 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
return seq; \
} \
\
+static __always_inline unsigned \
+__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
+{ \
+ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
+ \
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
+ return seq; \
+ \
+ if (preemptible && unlikely(seq & 1)) { \
+ __SEQ_LOCK(lockbase##_lock(s->lock)); \
+ __SEQ_LOCK(lockbase##_unlock(s->lock)); \
+ \
+ /* \
+ * Re-read the sequence counter since the (possibly \
+ * preempted) writer made progress. \
+ */ \
+ seq = smp_load_acquire(&s->seqcount.sequence); \
+ } \
+ \
+ return seq; \
+} \
+ \
static __always_inline bool \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
{ \
@@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
return READ_ONCE(s->sequence);
}
+static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
+{
+ return smp_load_acquire(&s->sequence);
+}
+
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
return false;
@@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
#define seqprop_ptr(s) __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s) __seqprop(s, sequence)(s)
+#define seqprop_sequence_acquire(s) __seqprop(s, sequence_acquire)(s)
#define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
#define seqprop_assert(s) __seqprop(s, assert)(s)
@@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
*
* Return: count to be passed to read_seqcount_retry()
*/
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+#define raw_read_seqcount_begin(s) \
+({ \
+ unsigned _seq; \
+ \
+ while ((_seq = seqprop_sequence_acquire(s)) & 1) \
+ cpu_relax(); \
+ \
+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
+ _seq; \
+})
+#else
#define raw_read_seqcount_begin(s) \
({ \
unsigned _seq = __read_seqcount_begin(s); \
@@ -300,6 +340,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
smp_rmb(); \
_seq; \
})
+#endif
/**
* read_seqcount_begin() - begin a seqcount_t read critical section
---
base-commit: b0da640826ba3b6506b4996a6b23a429235e6923
change-id: 20240813-seq_optimize-68c48696c798
Best regards,
--
Christoph Lameter <cl@gentwo.org>
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-19 18:30 [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire Christoph Lameter via B4 Relay
@ 2024-08-23 10:32 ` Will Deacon
2024-08-23 17:56 ` Christoph Lameter (Ampere)
2024-08-23 19:38 ` Christoph Lameter (Ampere)
2024-08-23 21:05 ` Thomas Gleixner
1 sibling, 2 replies; 7+ messages in thread
From: Will Deacon @ 2024-08-23 10:32 UTC (permalink / raw)
To: cl
Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long,
Boqun Feng, Linus Torvalds, linux-mm, linux-kernel,
linux-arm-kernel, linux-arch
On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index d90d8ee29d81..353fcf32b800 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
> return seq; \
> } \
> \
> +static __always_inline unsigned \
> +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> +{ \
> + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
> + \
> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
> + return seq; \
> + \
> + if (preemptible && unlikely(seq & 1)) { \
> + __SEQ_LOCK(lockbase##_lock(s->lock)); \
> + __SEQ_LOCK(lockbase##_unlock(s->lock)); \
> + \
> + /* \
> + * Re-read the sequence counter since the (possibly \
> + * preempted) writer made progress. \
> + */ \
> + seq = smp_load_acquire(&s->seqcount.sequence); \
We could probably do even better with LDAPR here, as that should be
sufficient for this. It's a can of worms though, as it's not implemented
on all CPUs and relaxing smp_load_acquire() might introduce subtle
breakage in places where it's used to build other types of lock. Maybe
you can hack something to see if there's any performance left behind
without it?
> + } \
> + \
> + return seq; \
> +} \
> + \
> static __always_inline bool \
> __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
> { \
> @@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
> return READ_ONCE(s->sequence);
> }
>
> +static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
> +{
> + return smp_load_acquire(&s->sequence);
> +}
> +
> static inline bool __seqprop_preemptible(const seqcount_t *s)
> {
> return false;
> @@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> #define seqprop_ptr(s) __seqprop(s, ptr)(s)
> #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
> #define seqprop_sequence(s) __seqprop(s, sequence)(s)
> +#define seqprop_sequence_acquire(s) __seqprop(s, sequence_acquire)(s)
> #define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
> #define seqprop_assert(s) __seqprop(s, assert)(s)
>
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> *
> * Return: count to be passed to read_seqcount_retry()
> */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s) \
> +({ \
> + unsigned _seq; \
> + \
> + while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> + cpu_relax(); \
It would also be interesting to see whether smp_cond_load_acquire()
performs any better than this loop in the !RT case.
Both things to look at separately though, so:
Acked-by: Will Deacon <will@kernel.org>
I assume this will go via -tip.
Will
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-23 10:32 ` Will Deacon
@ 2024-08-23 17:56 ` Christoph Lameter (Ampere)
2024-08-23 19:38 ` Christoph Lameter (Ampere)
1 sibling, 0 replies; 7+ messages in thread
From: Christoph Lameter (Ampere) @ 2024-08-23 17:56 UTC (permalink / raw)
To: Will Deacon
Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long,
Boqun Feng, Linus Torvalds, linux-mm, linux-kernel,
linux-arm-kernel, linux-arch
On Fri, 23 Aug 2024, Will Deacon wrote:
> On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> > +static __always_inline unsigned \
> > +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> > +{ \
> > + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
> > + \
> > + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
> > + return seq; \
> > + \
> > + if (preemptible && unlikely(seq & 1)) { \
> > + __SEQ_LOCK(lockbase##_lock(s->lock)); \
> > + __SEQ_LOCK(lockbase##_unlock(s->lock)); \
> > + \
> > + /* \
> > + * Re-read the sequence counter since the (possibly \
> > + * preempted) writer made progress. \
> > + */ \
> > + seq = smp_load_acquire(&s->seqcount.sequence); \
>
> We could probably do even better with LDAPR here, as that should be
> sufficient for this. It's a can of worms though, as it's not implemented
> on all CPUs and relaxing smp_load_acquire() might introduce subtle
> breakage in places where it's used to build other types of lock. Maybe
> you can hack something to see if there's any performance left behind
> without it?
I added the following patch. The kernel booted fine, with no change in the
cycles of read_seq().
LDAPR
---------------------------------------------------------------------------
Test         Single   2 CPU   4 CPU   8 CPU  16 CPU  32 CPU  64 CPU     ALL
write seq :      13      98     385     764    1551    3043    6259   11922
read seq  :       8       8       8       8       8       8       9      10
rw seq    :       8     101     247     300     467     742    1384    2101
LDA
---------------------------------------------------------------------------
Test         Single   2 CPU   4 CPU   8 CPU  16 CPU  32 CPU  64 CPU     ALL
write seq :      13      90     343     785    1533    3032    6315   11073
read seq  :       8       8       8       8       8       8       9      11
rw seq    :       8      79     227     313     423     755    1313    2220
Index: linux/arch/arm64/include/asm/barrier.h
===================================================================
--- linux.orig/arch/arm64/include/asm/barrier.h
+++ linux/arch/arm64/include/asm/barrier.h
@@ -167,22 +167,22 @@ do { \
kasan_check_read(__p, sizeof(*p)); \
switch (sizeof(*p)) { \
case 1: \
- asm volatile ("ldarb %w0, %1" \
+ asm volatile (".arch_extension rcpc\nldaprb %w0, %1" \
: "=r" (*(__u8 *)__u.__c) \
: "Q" (*__p) : "memory"); \
break; \
case 2: \
- asm volatile ("ldarh %w0, %1" \
+ asm volatile (".arch_extension rcpc\nldaprh %w0, %1" \
: "=r" (*(__u16 *)__u.__c) \
: "Q" (*__p) : "memory"); \
break; \
case 4: \
- asm volatile ("ldar %w0, %1" \
+ asm volatile (".arch_extension rcpc\nldapr %w0, %1" \
: "=r" (*(__u32 *)__u.__c) \
: "Q" (*__p) : "memory"); \
break; \
case 8: \
- asm volatile ("ldar %0, %1" \
+ asm volatile (".arch_extension rcpc\nldapr %0, %1" \
: "=r" (*(__u64 *)__u.__c) \
: "Q" (*__p) : "memory"); \
break; \
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-23 10:32 ` Will Deacon
2024-08-23 17:56 ` Christoph Lameter (Ampere)
@ 2024-08-23 19:38 ` Christoph Lameter (Ampere)
1 sibling, 0 replies; 7+ messages in thread
From: Christoph Lameter (Ampere) @ 2024-08-23 19:38 UTC (permalink / raw)
To: Will Deacon
Cc: Catalin Marinas, Peter Zijlstra, Ingo Molnar, Waiman Long,
Boqun Feng, Linus Torvalds, linux-mm, linux-kernel,
linux-arm-kernel, linux-arch
On Fri, 23 Aug 2024, Will Deacon wrote:
> > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> > +#define raw_read_seqcount_begin(s) \
> > +({ \
> > + unsigned _seq; \
> > + \
> > + while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> > + cpu_relax(); \
>
> It would also be interesting to see whether smp_cond_load_acquire()
> performs any better than this loop in the !RT case.
The hack to do this follows. The kernel boots, but there is no change in
cycles. It also builds a kernel just fine.
Another benchmark may be better; all my synthetic tests do is run the
function calls in a loop, in parallel on multiple CPUs.
The main effect here may be reduced power consumption, since the busy loop
is no longer required. I would favor a solution like this, but the patch is
not clean given the need to cast away the const attribute.
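For reference, each synthetic test is essentially a per-CPU loop of the
following shape (a rough sketch only, not the actual test code; the name,
counter setup and iteration count here are made up):
	static seqcount_t test_seq = SEQCNT_ZERO(test_seq);
	static void bench_read_seq(void *unused)
	{
		unsigned long start = get_cycles();
		unsigned seq;
		int i;
		for (i = 0; i < 1000000; i++) {
			do {
				seq = read_seqcount_begin(&test_seq);
				/* empty read section */
			} while (read_seqcount_retry(&test_seq, seq));
		}
		pr_info("cpu %d: %lu cycles/iteration\n",
			smp_processor_id(), (get_cycles() - start) / 1000000);
	}
	/* run concurrently on all CPUs, e.g. on_each_cpu(bench_read_seq, NULL, 1) */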
Index: linux/include/linux/seqlock.h
===================================================================
--- linux.orig/include/linux/seqlock.h
+++ linux/include/linux/seqlock.h
@@ -325,9 +325,9 @@ SEQCOUNT_LOCKNAME(mutex, struct m
#define raw_read_seqcount_begin(s) \
({ \
unsigned _seq; \
+ seqcount_t *e = seqprop_ptr((struct seqcount_spinlock *)s); \
\
- while ((_seq = seqprop_sequence_acquire(s)) & 1) \
- cpu_relax(); \
+ _seq = smp_cond_load_acquire(&e->sequence, ((e->sequence & 1) == 0)); \
\
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
_seq; \
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-19 18:30 [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire Christoph Lameter via B4 Relay
2024-08-23 10:32 ` Will Deacon
@ 2024-08-23 21:05 ` Thomas Gleixner
2024-08-28 17:15 ` Christoph Lameter (Ampere)
1 sibling, 1 reply; 7+ messages in thread
From: Thomas Gleixner @ 2024-08-23 21:05 UTC (permalink / raw)
To: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon,
Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng
Cc: Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel,
linux-arch, Christoph Lameter (Ampere)
On Mon, Aug 19 2024 at 11:30, Christoph Lameter via B4 Relay wrote:
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
> *
> * Return: count to be passed to read_seqcount_retry()
> */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s) \
> +({ \
> + unsigned _seq; \
> + \
> + while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> + cpu_relax(); \
> + \
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
> + _seq; \
> +})
So this covers only raw_read_seqcount_begin(), but not
raw_read_seqcount(), which has the same smp_rmb() inside.
This all can be done without the extra copies of the counter
accessors. Uncompiled patch below.
It's a little larger than I initially wanted, but I had to keep
the raw READ_ONCE() for __read_seqcount_begin() so as not to inflict the
smp_load_acquire() on the only usage site in the dcache code.
The acquire conditional in __seqprop_load_sequence() is optimized out by
the compiler as all of this is macro/__always_inline.
Thanks,
tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -132,6 +132,14 @@ static inline void seqcount_lockdep_read
#define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex)
+static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
+{
+ if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
+ return smp_load_acquire(&s->sequence);
+ else
+ return READ_ONCE(s->sequence);
+}
+
/*
* SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers
* seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t
@@ -155,9 +163,10 @@ static __always_inline const seqcount_t
} \
\
static __always_inline unsigned \
-__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
+__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s, \
+ bool acquire) \
{ \
- unsigned seq = READ_ONCE(s->seqcount.sequence); \
+ unsigned seq = __seqprop_load_sequence(&s->seqcount, acquire); \
\
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
return seq; \
@@ -170,7 +179,7 @@ static __always_inline unsigned \
* Re-read the sequence counter since the (possibly \
* preempted) writer made progress. \
*/ \
- seq = READ_ONCE(s->seqcount.sequence); \
+ seq = __seqprop_load_sequence(&s->seqcount, acquire); \
} \
\
return seq; \
@@ -206,9 +215,9 @@ static inline const seqcount_t *__seqpro
return s;
}
-static inline unsigned __seqprop_sequence(const seqcount_t *s)
+static inline unsigned __seqprop_sequence(const seqcount_t *s, bool acquire)
{
- return READ_ONCE(s->sequence);
+ return __seqprop_load_sequence(s, acquire);
}
static inline bool __seqprop_preemptible(const seqcount_t *s)
@@ -258,29 +267,23 @@ SEQCOUNT_LOCKNAME(mutex, struct m
#define seqprop_ptr(s) __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
-#define seqprop_sequence(s) __seqprop(s, sequence)(s)
+#define seqprop_sequence(s, a) __seqprop(s, sequence)(s, a)
#define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
#define seqprop_assert(s) __seqprop(s, assert)(s)
/**
- * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
- * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
- *
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
+ * read_seqcount_begin_cond_acquire() - begin a seqcount_t read section
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ * @acquire: If true, the read of the sequence count uses smp_load_acquire()
+ * if the architecture provides it and it is enabled.
*
* Return: count to be passed to read_seqcount_retry()
*/
-#define __read_seqcount_begin(s) \
+#define read_seqcount_begin_cond_acquire(s, acquire) \
({ \
unsigned __seq; \
\
- while ((__seq = seqprop_sequence(s)) & 1) \
+ while ((__seq = seqprop_sequence(s, acquire)) & 1) \
cpu_relax(); \
\
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
@@ -288,6 +291,26 @@ SEQCOUNT_LOCKNAME(mutex, struct m
})
/**
+ * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ *
+ * __read_seqcount_begin is like read_seqcount_begin, but it neither
+ * provides a smp_rmb() barrier nor does it use smp_load_acquire() on
+ * architectures which provide it.
+ *
+ * Callers should ensure that smp_rmb() or equivalent ordering is provided
+ * before actually loading any of the variables that are to be protected in
+ * this critical section.
+ *
+ * Use carefully, only in critical code, and comment how the barrier is
+ * provided.
+ *
+ * Return: count to be passed to read_seqcount_retry()
+ */
+#define __read_seqcount_begin(s) \
+ read_seqcount_begin_cond_acquire(s, false)
+
+/**
* raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
@@ -295,9 +318,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m
*/
#define raw_read_seqcount_begin(s) \
({ \
- unsigned _seq = __read_seqcount_begin(s); \
+ unsigned _seq = read_seqcount_begin_cond_acquire(s, true); \
\
- smp_rmb(); \
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \
+ smp_rmb(); \
_seq; \
})
@@ -326,9 +350,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m
*/
#define raw_read_seqcount(s) \
({ \
- unsigned __seq = seqprop_sequence(s); \
+ unsigned __seq = seqprop_sequence(s, true); \
\
- smp_rmb(); \
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \
+ smp_rmb(); \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
__seq; \
})
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-23 21:05 ` Thomas Gleixner
@ 2024-08-28 17:15 ` Christoph Lameter (Ampere)
2024-09-02 11:55 ` Thomas Gleixner
0 siblings, 1 reply; 7+ messages in thread
From: Christoph Lameter (Ampere) @ 2024-08-28 17:15 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon,
Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng,
Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel,
linux-arch
On Fri, 23 Aug 2024, Thomas Gleixner wrote:
> This all can be done without the extra copies of the counter
> accessors. Uncompiled patch below.
Great. Thanks. Tried it too initially but could not make it work right.
One thing that we also want is the use of the smp_cond_load_acquire to
have the cpu power down while waiting for a cacheline change.
The code has several places where loops occur when the last bit is set in
the seqcount.
We could use smp_cond_load_acquire in load_sequence() but what do we do
about the loops at the higher level? Also this does not sync with the lock
checking logic.
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 68b3af8bd6c6..4442a97ffe9a 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -135,7 +135,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
{
if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
- return smp_load_acquire(&s->sequence);
+ return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
else
return READ_ONCE(s->sequence);
}
* Re: [PATCH v2] Avoid memory barrier in read_seqcount() through load acquire
2024-08-28 17:15 ` Christoph Lameter (Ampere)
@ 2024-09-02 11:55 ` Thomas Gleixner
0 siblings, 0 replies; 7+ messages in thread
From: Thomas Gleixner @ 2024-09-02 11:55 UTC (permalink / raw)
To: Christoph Lameter (Ampere)
Cc: Christoph Lameter via B4 Relay, Catalin Marinas, Will Deacon,
Peter Zijlstra, Ingo Molnar, Waiman Long, Boqun Feng,
Linus Torvalds, linux-mm, linux-kernel, linux-arm-kernel,
linux-arch
On Wed, Aug 28 2024 at 10:15, Christoph Lameter wrote:
> On Fri, 23 Aug 2024, Thomas Gleixner wrote:
>
>> This all can be done without the extra copies of the counter
>> accessors. Uncompiled patch below.
>
> Great. Thanks. Tried it too initially but could not make it work right.
>
> One thing that we also want is the use of the smp_cond_load_acquire to
> have the cpu power down while waiting for a cacheline change.
>
> The code has several places where loops occur when the last bit is set in
> the seqcount.
>
> We could use smp_cond_load_acquire in load_sequence() but what do we do
> about the loops at the higher level? Also this does not sync with the lock
> checking logic.
Come on. It's not rocket science to figure that out.
Uncompiled delta patch below.
Thanks,
tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -23,6 +23,13 @@
#include <asm/processor.h>
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+# define USE_LOAD_ACQUIRE true
+# define USE_COND_LOAD_ACQUIRE !IS_ENABLED(CONFIG_PREEMPT_RT)
+#else
+# define USE_LOAD_ACQUIRE false
+# define USE_COND_LOAD_ACQUIRE false
+#endif
/*
* The seqlock seqcount_t interface does not prescribe a precise sequence of
* read begin/retry/end. For readers, typically there is a call to
@@ -134,10 +141,13 @@ static inline void seqcount_lockdep_read
static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
{
- if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
- return smp_load_acquire(&s->sequence);
- else
+ if (!acquire || !USE_LOAD_ACQUIRE)
return READ_ONCE(s->sequence);
+
+ if (USE_COND_LOAD_ACQUIRE)
+ return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
+
+ return smp_load_acquire(&s->sequence);
}
/*
@@ -283,8 +293,12 @@ SEQCOUNT_LOCKNAME(mutex, struct m
({ \
unsigned __seq; \
\
- while ((__seq = seqprop_sequence(s, acquire)) & 1) \
- cpu_relax(); \
+ if (acquire && USE_COND_LOAD_ACQUIRE) { \
+ __seq = seqprop_sequence(s, acquire); \
+ } else { \
+ while ((__seq = seqprop_sequence(s, acquire)) & 1) \
+ cpu_relax(); \
+ } \
\
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
__seq; \