* [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
@ 2014-12-04 6:20 Paul E. McKenney
[not found] ` <CA+55aFzn-6asfWHB8SAvz4GJWz7uEriujgVLfaqoo_VPtNBLuA@mail.gmail.com>
0 siblings, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 6:20 UTC (permalink / raw)
To: linux-kernel; +Cc: linux-arch, dvyukov, dave, mingo, peterz, torvalds
More concern about compilers...
Most architectures mark the fields in their arch_spinlock_t structures
as volatile, which forces the compiler to respect the integrity of
those fields. Without volatile markings, the compiler is within its
rights to overwrite these fields, for example, by using a wide store
to update an adjacent field as long as it fixes them up later. This
sort of thing could come as a rather nasty surprise to any task attempting
to concurrently acquire that lock.
For example, on x86 for smallish NR_CPUS, arch_spinlock_t is 16 bits wide.
If there were three adjacent 16-bit fields in the structure containing
the arch_spinlock_t, a compiler might reasonably use 64-bit accesses
for those three fields. After a 64-bit load, the arch_spinlock_t's
value would be available in a register, so that the compiler might use
a 64-bit store followed by a 16-bit fixup store to update the three
adjacent 16-bit fields.
This commit therefore adds volatile to the arch_spinlock_t and
arch_rwlock_t fields that don't already have them.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: linux-arch@vger.kernel.org
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
index 47663fcb10ad..7d3b6ecb5301 100644
--- a/arch/arm/include/asm/spinlock_types.h
+++ b/arch/arm/include/asm/spinlock_types.h
@@ -9,14 +9,14 @@
typedef struct {
union {
- u32 slock;
+ volatile u32 slock;
struct __raw_tickets {
#ifdef __ARMEB__
- u16 next;
- u16 owner;
+ volatile u16 next;
+ volatile u16 owner;
#else
- u16 owner;
- u16 next;
+ volatile u16 owner;
+ volatile u16 next;
#endif
} tickets;
};
@@ -25,7 +25,7 @@ typedef struct {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
typedef struct {
- u32 lock;
+ volatile u32 lock;
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { 0 }
diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
index b8d383665f56..0f841378f0f3 100644
--- a/arch/arm64/include/asm/spinlock_types.h
+++ b/arch/arm64/include/asm/spinlock_types.h
@@ -24,11 +24,11 @@
typedef struct {
#ifdef __AARCH64EB__
- u16 next;
- u16 owner;
+ volatile u16 next;
+ volatile u16 owner;
#else
- u16 owner;
- u16 next;
+ volatile u16 owner;
+ volatile u16 next;
#endif
} __aligned(4) arch_spinlock_t;
diff --git a/arch/mips/include/asm/spinlock_types.h b/arch/mips/include/asm/spinlock_types.h
index 9b2528e612c0..10f04a0482dd 100644
--- a/arch/mips/include/asm/spinlock_types.h
+++ b/arch/mips/include/asm/spinlock_types.h
@@ -14,14 +14,14 @@ typedef union {
* bits 0..15 : serving_now
* bits 16..31 : ticket
*/
- u32 lock;
+ volatile u32 lock;
struct {
#ifdef __BIG_ENDIAN
- u16 ticket;
- u16 serving_now;
+ volatile u16 ticket;
+ volatile u16 serving_now;
#else
- u16 serving_now;
- u16 ticket;
+ volatile u16 serving_now;
+ volatile u16 ticket;
#endif
} h;
} arch_spinlock_t;
diff --git a/arch/mn10300/include/asm/spinlock_types.h b/arch/mn10300/include/asm/spinlock_types.h
index 653dc519b405..04fd2c622f81 100644
--- a/arch/mn10300/include/asm/spinlock_types.h
+++ b/arch/mn10300/include/asm/spinlock_types.h
@@ -6,13 +6,13 @@
#endif
typedef struct arch_spinlock {
- unsigned int slock;
+ volatile unsigned int slock;
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
typedef struct {
- unsigned int lock;
+ volatile unsigned int lock;
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index d84b6939237c..0ccdda3b6842 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -6,14 +6,14 @@
#endif
typedef struct {
- unsigned int lock;
+ volatile unsigned int lock;
} __attribute__ ((aligned (4))) arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, }
typedef struct {
- unsigned int lock;
- unsigned int owner;
+ volatile unsigned int lock;
+ volatile unsigned int owner;
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { 0 }
diff --git a/arch/tile/include/asm/spinlock_types.h b/arch/tile/include/asm/spinlock_types.h
index a71f59b49c50..29f70a14b979 100644
--- a/arch/tile/include/asm/spinlock_types.h
+++ b/arch/tile/include/asm/spinlock_types.h
@@ -23,14 +23,14 @@
/* Low 15 bits are "next"; high 15 bits are "current". */
typedef struct arch_spinlock {
- unsigned int lock;
+ volatile unsigned int lock;
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
/* High bit is "writer owns"; low 31 bits are a count of readers. */
typedef struct arch_rwlock {
- unsigned int lock;
+ volatile unsigned int lock;
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { 0 }
@@ -39,9 +39,9 @@ typedef struct arch_rwlock {
typedef struct arch_spinlock {
/* Next ticket number to hand out. */
- int next_ticket;
+ volatile int next_ticket;
/* The ticket number that currently owns this lock. */
- int current_ticket;
+ volatile int current_ticket;
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { 0, 0 }
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 5f9d7572d82b..fead0ca82468 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -25,9 +25,9 @@ typedef u32 __ticketpair_t;
typedef struct arch_spinlock {
union {
- __ticketpair_t head_tail;
+ volatile __ticketpair_t head_tail;
struct __raw_tickets {
- __ticket_t head, tail;
+ volatile __ticket_t head, tail;
} tickets;
};
} arch_spinlock_t;
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
[not found] ` <CA+55aFzn-6asfWHB8SAvz4GJWz7uEriujgVLfaqoo_VPtNBLuA@mail.gmail.com>
@ 2014-12-04 6:57 ` Paul E. McKenney
[not found] ` <CA+55aFxGCxQNE4HzRP_Uk2FmFDn_+Hfm6GBz75S+5+SDeODVJQ@mail.gmail.com>
1 sibling, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 6:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: Ingo Molnar, linux-arch, dave, peterz, dvyukov, linux-kernel
One point of the patch is "one more thing to watch for" in generated code,
namely temporary clobbering of synchronization variables, locks included,
due to overzealous optimization. If this happens to the kernel, I guess
that the other quick workaround is to add alignment directives or padding.
(And yes, I have seen reports of non-kernel examples where gcc actually
does this sort of thing.)
But I don't feel strongly about this patch, not yet anyway, so will set
it aside for the time being.
Thanx, Paul
On Wed, Dec 03, 2014 at 10:31:06PM -0800, Linus Torvalds wrote:
> NAK. Volatile on data structures (as opposed to in code) is a disease. This
> is wrong.
>
> We very much pass spinlock values around. They aren't volatile. The only
> thing volatile is when you load the value from the lock itself. See for
> example the stuff we do when we load a seqlock from memory. Once we've
> loaded the value into registers it's no longer volatile, and putting
> volatiles into the data structures is actively *wrong*.
>
> So no, no, no. C got this wrong. Volatile data structures are a fundamental
> mistake and a bug. The only good volatile is in an access (ie like
> ACCESS_ONCE()) and anything else is making things worse.
>
> We're not catering to crazy compiler people that can't understand that
> fact. I'm not even seeing the point of your patch, since it seems to be
> entirely about "let's make excuses for crap compilers"
>
> Linus
> On Dec 3, 2014 10:20 PM, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> wrote:
>
> > More concern about compilers...
> >
> > Most architectures mark the fields in their arch_spinlock_t structures
> > as volatile, which forces the compiler to respect the integrity of
> > those fields. Without volatile markings, the compiler is within its
> > rights to overwrite these fields, for example, by using a wide store
> > to update an adjacent field as long as it fixes them up later. This
> > sort of thing could come as a rather nasty surprise to any task attempting
> > to concurrently acquire that lock.
> >
> > For example, on x86 for smallish NR_CPUS, arch_spinlock_t is 16 bits wide.
> > If there were three adjacent 16-bit fields in the structure containing
> > the arch_spinlock_t, a compiler might reasonably use 64-bit accesses
> > for those three fields. After a 64-bit load, the arch_spinlock_t's
> > value would be available in a register, so that the compiler might use
> > a 64-bit store followed by a 16-bit fixup store to update the three
> > adjacent 16-bit fields.
> >
> > This commit therefore adds volatile to the arch_spinlock_t and
> > arch_rwlock_t fields that don't already have them.
> >
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Cc: linux-arch@vger.kernel.org
> > Cc: Dmitry Vyukov <dvyukov@google.com>
> > Cc: Davidlohr Bueso <dave@stgolabs.net>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> >
> > diff --git a/arch/arm/include/asm/spinlock_types.h
> > b/arch/arm/include/asm/spinlock_types.h
> > index 47663fcb10ad..7d3b6ecb5301 100644
> > --- a/arch/arm/include/asm/spinlock_types.h
> > +++ b/arch/arm/include/asm/spinlock_types.h
> > @@ -9,14 +9,14 @@
> >
> > typedef struct {
> > union {
> > - u32 slock;
> > + volatile u32 slock;
> > struct __raw_tickets {
> > #ifdef __ARMEB__
> > - u16 next;
> > - u16 owner;
> > + volatile u16 next;
> > + volatile u16 owner;
> > #else
> > - u16 owner;
> > - u16 next;
> > + volatile u16 owner;
> > + volatile u16 next;
> > #endif
> > } tickets;
> > };
> > @@ -25,7 +25,7 @@ typedef struct {
> > #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
> >
> > typedef struct {
> > - u32 lock;
> > + volatile u32 lock;
> > } arch_rwlock_t;
> >
> > #define __ARCH_RW_LOCK_UNLOCKED { 0 }
> > diff --git a/arch/arm64/include/asm/spinlock_types.h
> > b/arch/arm64/include/asm/spinlock_types.h
> > index b8d383665f56..0f841378f0f3 100644
> > --- a/arch/arm64/include/asm/spinlock_types.h
> > +++ b/arch/arm64/include/asm/spinlock_types.h
> > @@ -24,11 +24,11 @@
> >
> > typedef struct {
> > #ifdef __AARCH64EB__
> > - u16 next;
> > - u16 owner;
> > + volatile u16 next;
> > + volatile u16 owner;
> > #else
> > - u16 owner;
> > - u16 next;
> > + volatile u16 owner;
> > + volatile u16 next;
> > #endif
> > } __aligned(4) arch_spinlock_t;
> >
> > diff --git a/arch/mips/include/asm/spinlock_types.h
> > b/arch/mips/include/asm/spinlock_types.h
> > index 9b2528e612c0..10f04a0482dd 100644
> > --- a/arch/mips/include/asm/spinlock_types.h
> > +++ b/arch/mips/include/asm/spinlock_types.h
> > @@ -14,14 +14,14 @@ typedef union {
> > * bits 0..15 : serving_now
> > * bits 16..31 : ticket
> > */
> > - u32 lock;
> > + volatile u32 lock;
> > struct {
> > #ifdef __BIG_ENDIAN
> > - u16 ticket;
> > - u16 serving_now;
> > + volatile u16 ticket;
> > + volatile u16 serving_now;
> > #else
> > - u16 serving_now;
> > - u16 ticket;
> > + volatile u16 serving_now;
> > + volatile u16 ticket;
> > #endif
> > } h;
> > } arch_spinlock_t;
> > diff --git a/arch/mn10300/include/asm/spinlock_types.h
> > b/arch/mn10300/include/asm/spinlock_types.h
> > index 653dc519b405..04fd2c622f81 100644
> > --- a/arch/mn10300/include/asm/spinlock_types.h
> > +++ b/arch/mn10300/include/asm/spinlock_types.h
> > @@ -6,13 +6,13 @@
> > #endif
> >
> > typedef struct arch_spinlock {
> > - unsigned int slock;
> > + volatile unsigned int slock;
> > } arch_spinlock_t;
> >
> > #define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
> >
> > typedef struct {
> > - unsigned int lock;
> > + volatile unsigned int lock;
> > } arch_rwlock_t;
> >
> > #define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
> > diff --git a/arch/s390/include/asm/spinlock_types.h
> > b/arch/s390/include/asm/spinlock_types.h
> > index d84b6939237c..0ccdda3b6842 100644
> > --- a/arch/s390/include/asm/spinlock_types.h
> > +++ b/arch/s390/include/asm/spinlock_types.h
> > @@ -6,14 +6,14 @@
> > #endif
> >
> > typedef struct {
> > - unsigned int lock;
> > + volatile unsigned int lock;
> > } __attribute__ ((aligned (4))) arch_spinlock_t;
> >
> > #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, }
> >
> > typedef struct {
> > - unsigned int lock;
> > - unsigned int owner;
> > + volatile unsigned int lock;
> > + volatile unsigned int owner;
> > } arch_rwlock_t;
> >
> > #define __ARCH_RW_LOCK_UNLOCKED { 0 }
> > diff --git a/arch/tile/include/asm/spinlock_types.h
> > b/arch/tile/include/asm/spinlock_types.h
> > index a71f59b49c50..29f70a14b979 100644
> > --- a/arch/tile/include/asm/spinlock_types.h
> > +++ b/arch/tile/include/asm/spinlock_types.h
> > @@ -23,14 +23,14 @@
> >
> > /* Low 15 bits are "next"; high 15 bits are "current". */
> > typedef struct arch_spinlock {
> > - unsigned int lock;
> > + volatile unsigned int lock;
> > } arch_spinlock_t;
> >
> > #define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
> >
> > /* High bit is "writer owns"; low 31 bits are a count of readers. */
> > typedef struct arch_rwlock {
> > - unsigned int lock;
> > + volatile unsigned int lock;
> > } arch_rwlock_t;
> >
> > #define __ARCH_RW_LOCK_UNLOCKED { 0 }
> > @@ -39,9 +39,9 @@ typedef struct arch_rwlock {
> >
> > typedef struct arch_spinlock {
> > /* Next ticket number to hand out. */
> > - int next_ticket;
> > + volatile int next_ticket;
> > /* The ticket number that currently owns this lock. */
> > - int current_ticket;
> > + volatile int current_ticket;
> > } arch_spinlock_t;
> >
> > #define __ARCH_SPIN_LOCK_UNLOCKED { 0, 0 }
> > diff --git a/arch/x86/include/asm/spinlock_types.h
> > b/arch/x86/include/asm/spinlock_types.h
> > index 5f9d7572d82b..fead0ca82468 100644
> > --- a/arch/x86/include/asm/spinlock_types.h
> > +++ b/arch/x86/include/asm/spinlock_types.h
> > @@ -25,9 +25,9 @@ typedef u32 __ticketpair_t;
> >
> > typedef struct arch_spinlock {
> > union {
> > - __ticketpair_t head_tail;
> > + volatile __ticketpair_t head_tail;
> > struct __raw_tickets {
> > - __ticket_t head, tail;
> > + volatile __ticket_t head, tail;
> > } tickets;
> > };
> > } arch_spinlock_t;
> >
> >
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
[not found] ` <CA+55aFxGCxQNE4HzRP_Uk2FmFDn_+Hfm6GBz75S+5+SDeODVJQ@mail.gmail.com>
@ 2014-12-04 7:02 ` Paul E. McKenney
2014-12-04 18:02 ` Linus Torvalds
0 siblings, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 7:02 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-arch, Davidlohr Bueso, dvyukov, peterz, linux-kernel,
Ingo Molnar
On Wed, Dec 03, 2014 at 10:40:45PM -0800, Linus Torvalds wrote:
> On Dec 3, 2014 10:31 PM, "Linus Torvalds" <torvalds@linux-foundation.org>
> wrote:
> >
> > So no, no, no. C got this wrong. Volatile data structures are a
> fundamental mistake and a bug.
>
> BTW, I'm not at all interested in language lawyering and people who say
> "but but we can do x". A compiler that modifies adjacent fields because the
> standard leaves is open is a crap compiler, and we won't use it, or disable
> the broken optimization. It is wrong from a concurrency standpoint anyway,
> and adding broken volatiles is just making things worse.
Understood, for example, adjacent fields protected by different locks
as one example, where adjacent-field overwriting completely breaks even
very conservatively designed code. Should be entertaining! ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 7:02 ` Paul E. McKenney
@ 2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:36 ` Paul E. McKenney
0 siblings, 2 replies; 11+ messages in thread
From: Linus Torvalds @ 2014-12-04 18:02 UTC (permalink / raw)
To: Paul McKenney
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Wed, Dec 3, 2014 at 11:02 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Wed, Dec 03, 2014 at 10:40:45PM -0800, Linus Torvalds wrote:
>> On Dec 3, 2014 10:31 PM, "Linus Torvalds" <torvalds@linux-foundation.org>
>> wrote:
>> >
>> > So no, no, no. C got this wrong. Volatile data structures are a
>> fundamental mistake and a bug.
>>
>> BTW, I'm not at all interested in language lawyering and people who say
>> "but but we can do x". A compiler that modifies adjacent fields because the
>> standard leaves it open is a crap compiler, and we won't use it, or disable
>> the broken optimization. It is wrong from a concurrency standpoint anyway,
>> and adding broken volatiles is just making things worse.
>
> Understood, for example, adjacent fields protected by different locks
> as one example, where adjacent-field overwriting completely breaks even
> very conservatively designed code.
Exactly. Compilers that "optimize" things to touch fields that aren't
touched by the source code are simply inherently buggy shit. I'm not
at all interested in catering to their insanity.
It doesn't matter one whit if they can point to the legacy C "virtual
machine" definition and say that those accesses are invisible in the
virtual machine. They are not invisible in real life, and it is
entirely possible that two adjacent variables or fields are protected
by different locks - even in non-kernel code. Claiming that they need
to be marked volatile is a symptom of a diseased compiler writer.
Now, the one exception to this is generally bitfields, because there
the programmer knowingly and intentionally puts the fields together in
the same storage unit. I also think that volatile bitfields are an
insane concept, even if I think that the standard allows them. So I am
not saying that compilers should try to magically make bitfield
members not access the members around them.
I also accept that some architectures are broken. Old
non-byte/word-access alpha being the really canonical example. It's
not the compilers fault if the architecture is broken, and the
compiler cannot magically fix it.
But compilers that think that "hey, vectorization is cool, and I can
do load-stores and mask things dynamically" are misguided crap. It may
be fancy, it may be really cool compiler technology, but it's
fundamentally wrong unless the programmer told it it was safe some way
(be it with a "pragma" or "restrict" or a compile-time switch or
whatever).
Linus
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 18:02 ` Linus Torvalds
@ 2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:36 ` Paul E. McKenney
1 sibling, 0 replies; 11+ messages in thread
From: Linus Torvalds @ 2014-12-04 18:02 UTC (permalink / raw)
To: Paul McKenney
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Wed, Dec 3, 2014 at 11:02 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Wed, Dec 03, 2014 at 10:40:45PM -0800, Linus Torvalds wrote:
>> On Dec 3, 2014 10:31 PM, "Linus Torvalds" <torvalds@linux-foundation.org>
>> wrote:
>> >
>> > So no, no, no. C got this wrong. Volatile data structures are a
>> fundamental mistake and a bug.
>>
>> BTW, I'm not at all interested in language lawyering and people who say
>> "but but we can do x". A compiler that modifies adjacent fields because the
>> standard leaves it open is a crap compiler, and we won't use it, or disable
>> the broken optimization. It is wrong from a concurrency standpoint anyway,
>> and adding broken volatiles is just making things worse.
>
> Understood, for example, adjacent fields protected by different locks
> as one example, where adjacent-field overwriting completely breaks even
> very conservatively designed code.
Exactly. Compilers that "optimize" things to touch fields that aren't
touched by the source code are simply inherently buggy shit. I'm not
at all interested in catering to their insanity.
It doesn't matter one whit if they can point to the legacy C "virtual
machine" definition and say that those accesses are invisible in the
virtual machine. They are not invisible in real life, and it is
entirely possible that two adjacent variables or fields are protected
by different locks - even in non-kernel code. Claiming that they need
to be marked volatile is a symptom of a diseased compiler writer.
Now, the one exception to this is generally bitfields, because there
the programmer knowingly and intentionally puts the fields together in
the same storage unit. I also think that volatile bitfields are an
insane concept, even if I think that the standard allows them. So I am
not saying that compilers should try to magically make bitfield
members not access the members around them.
I also accept that some architectures are broken. Old
non-byte/word-access alpha being the really canonical example. It's
not the compilers fault if the architecture is broken, and the
compiler cannot magically fix it.
But compilers that think that "hey, vectorization is cool, and I can
do load-stores and mask things dynamically" are misguided crap. It may
be fancy, it may be really cool compiler technology, but it's
fundamentally wrong unless the programmer told it it was safe some way
(be it with a "pragma" or "restrict" or a compile-time switch or
whatever).
Linus
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:02 ` Linus Torvalds
@ 2014-12-04 18:36 ` Paul E. McKenney
2014-12-04 19:18 ` Linus Torvalds
2014-12-04 21:45 ` One Thousand Gnomes
1 sibling, 2 replies; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 18:36 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Thu, Dec 04, 2014 at 10:02:14AM -0800, Linus Torvalds wrote:
> On Wed, Dec 3, 2014 at 11:02 PM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> > On Wed, Dec 03, 2014 at 10:40:45PM -0800, Linus Torvalds wrote:
> >> On Dec 3, 2014 10:31 PM, "Linus Torvalds" <torvalds@linux-foundation.org>
> >> wrote:
> >> >
> >> > So no, no, no. C got this wrong. Volatile data structures are a
> >> fundamental mistake and a bug.
> >>
> >> BTW, I'm not at all interested in language lawyering and people who say
> >> "but but we can do x". A compiler that modifies adjacent fields because the
> >> standard leaves it open is a crap compiler, and we won't use it, or disable
> >> the broken optimization. It is wrong from a concurrency standpoint anyway,
> >> and adding broken volatiles is just making things worse.
> >
> > Understood, for example, adjacent fields protected by different locks
> > as one example, where adjacent-field overwriting completely breaks even
> > very conservatively designed code.
>
> Exactly. Compilers that "optimize" things to touch fields that aren't
> touched by the source code are simply inherently buggy shit. I'm not
> at all interested in catering to their insanity.
>
> It doesn't matter one whit if they can point to the legacy C "virtual
> machine" definition and say that those accesses are invisible in the
> virtual machine. They are not invisible in real life, and it is
> entirely possible that two adjacent variables or fields are protected
> by different locks - even in non-kernel code. Claiming that they need
> to be marked volatile is a symptom of a diseased compiler writer.
>
> Now, the one exception to this is generally bitfields, because there
> the programmer knowingly and intentionally puts the fields together in
> the same storage unit. I also think that volatile bitfields are an
> insane concept, even if I think that the standard allows them. So I am
> not saying that compilers should try to magically make bitfield
> members not access the members around them.
>
> I also accept that some architectures are broken. Old
> non-byte/word-access alpha being the really canonical example. It's
> not the compilers fault if the architecture is broken, and the
> compiler cannot magically fix it.
I have to ask... Does this mean we can remove the current
restrictions against 8-bit and 16-bit access from smp_load_acquire()
and smp_store_release()?
> But compilers that think that "hey, vectorization is cool, and I can
> do load-stores and mask things dynamically" are misguided crap. It may
> be fancy, it may be really cool compiler technology, but it's
> fundamentally wrong unless the programmer told it it was safe some way
> (be it with a "pragma" or "restrict" or a compile-time switch or
> whatever).
You might be happy to hear that I just sent an email to the C++ standards
committee noting that valid C11 compilers are not supposed to be able to
introduce data races [1]. I further argued that any store-widening access
affecting any non-private adjacent variable must be assumed to (illegally
per the C11 standard) introduce a data race, even if there are no locks,
atomic accesses, transactions, or any other synchronization mechanism
anywhere in that translation unit. After all, any non-static function
in that translation unit might be called from some other translation
unit that -did- use locking or whatever.
I will let you know how it goes. ;-)
Thanx, Paul
[1] A "data race" occurs in any C11 program where multiple threads
might be accessing a non-atomic variable, and where at least
one of the accesses is a write. C11 states that data races
result in undefined behavior. Therefore, if the source code
does not contain a data race, the object code had also better
be free of data races. Otherwise, the compiler inflicted
undefined behavior on a perfectly legitimate program.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 18:36 ` Paul E. McKenney
@ 2014-12-04 19:18 ` Linus Torvalds
2014-12-04 20:00 ` Paul E. McKenney
2014-12-04 21:45 ` One Thousand Gnomes
1 sibling, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2014-12-04 19:18 UTC (permalink / raw)
To: Paul McKenney
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Thu, Dec 4, 2014 at 10:36 AM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
>
> I have to ask... Does this mean we can remove the current
> restrictions against 8-bit and 16-bit access from smp_load_acquire()
> and smp_store_release()?
I'd certainly be ok with it. Alpha doesn't have acquire/release
semantics anyway (really, it's the worst memory ordering model
*ever*), so those will end up being just plain (access-once) loads and
stores, followed/preceded by a memory barrier anyway. So it arguably
is no worse than the existing situation with ACCESS_ONCE() on alpha.
And quite frankly, I simply don't think that an old broken alpha
architecture should be something we worry about. Remember: the byte
and word ops were introduced in 21164, and released in 1996, so it's
not even like "alpha has broken behavior". It's literally just "the
very earliest alphas were broken", and I suspect most of those
machines (at least running Linux) weren't even SMP (ie somebody may
still have a Multia around for sentimental reasons, but SMP? No).
Of course, I'd like there to be a real reason to do so, not just "who
cares about really old alphas, nyaah, nyaah, nyaah"? But if there is a
clear case where a byte load-acquire and store-release would improve
on something important, then yes, I think we should do it.
We dropped support for the original i386, we can drop support for old
broken alphas. People running them for sentimental reasons might as
well be sentimental about software too, and run old kernels ;)
Linus
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 19:18 ` Linus Torvalds
@ 2014-12-04 20:00 ` Paul E. McKenney
2014-12-04 20:12 ` Paul E. McKenney
0 siblings, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 20:00 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Thu, Dec 04, 2014 at 11:18:17AM -0800, Linus Torvalds wrote:
> On Thu, Dec 4, 2014 at 10:36 AM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> >
> > I have to ask... Does this mean we can remove the current
> > restrictions against 8-bit and 16-bit access from smp_load_acquire()
> > and smp_store_release()?
>
> I'd certainly be ok with it. Alpha doesn't have acquire/release
> semantics anyway (really, it's the worst memory ordering model
> *ever*), so those will end up being just plain (access-once) loads and
> stores, followed/preceded by a memory barrier anyway. So it arguably
> is no worse than the existing situation with ACCESS_ONCE() on alpha.
>
> And quite frankly, I simply don't think that an old broken alpha
> architecture should be something we worry about. Remember: the byte
> and word ops were introduced in 21164, and released in 1996, so it's
> not even like "alpha has broken behavior". It's literally just "the
> very earliest alphas were broken", and I suspect most of those
> machines (at least running Linux) weren't even SMP (ie somebody may
> still have a Multia around for sentimental reasons, but SMP? No).
>
> Of course, I'd like there to be a real reason to do so, not just "who
> cares about really old alphas, nyaah, nyaah, nyaah"? But if there is a
> clear case where a byte load-acquire and store-release would improve
> on something important, then yes, I think we should do it.
>
> We dropped support for the original i386, we can drop support for old
> broken alphas. People running them for sentimental reasons might as
> well be sentimental about software too, and run old kernels ;)
Sad to say, I cannot argue that the places where I have hit this thus far
have been anything more than irritations, although they have sometimes
been quite irritating. One is in rcutorture, but memory size is not a
concern there. Another is in a field in a struct that also contains an
rcu_head structure, so shrinking it below an int doesn't actually save
any space. A third one is a single global variable used to track sysidle
state, but it is just a single global variable that is not in TINY_RCU,
so who cares? If I find some place where this is either increasing
the size of a data structure that can have lots of instances or where
it is preventing an explicit memory barrier from being pulled into an
smp_load_acquire() or smp_store_release(), I will revisit.
And it turns out that there really is a prohibition against clobbering
other-thread-accessible adjacent fields and variables in the C11 standard,
section 3.14p2:
NOTE 1 Two threads of execution can update and access separate
memory locations without interfering with each other.
In the C++11 standard, 1.7p3 has a similar prohibition, as Ville
Voutilainen pointed out in response to my email:
Two or more threads of execution (1.10) can update and access
separate memory locations without interfering with each other.
I would have expected this to be in 1.10 rather than 1.7, but as long
as it is in there somewhere. ;-)
In both standards, "separate memory locations" mean "having no bytes
in common". This confirms your intuition that concurrent access to
bitfields cannot always be trusted. Yes, if the two bitfields don't
have any bits living in the same byte, we can in theory have two threads
updating them concurrently, but that would be a pain to verify.
So any compiler that clobbers some adjacent non-bitfield variable or
field that is accessible by other threads is not just despicable, it
fails to conform to the standard.
Whew! ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 20:00 ` Paul E. McKenney
@ 2014-12-04 20:12 ` Paul E. McKenney
0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 20:12 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-arch@vger.kernel.org, Davidlohr Bueso, Dmitry Vyukov,
Peter Zijlstra, Linux Kernel Mailing List, Ingo Molnar
On Thu, Dec 04, 2014 at 12:00:52PM -0800, Paul E. McKenney wrote:
[ . . . ]
> So any compiler that clobbers some adjacent non-bitfield variable or
> field that is accessible by other threads is not just despicable, it
> fails to conform to the standard.
>
> Whew! ;-)
And part of the reason for my confusion is that I am using an old version
of gcc, 4.6.3. Apparently this aspect of gcc wasn't fixed until 4.7
or thereabouts.
Thanx, Paul
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 18:36 ` Paul E. McKenney
2014-12-04 19:18 ` Linus Torvalds
@ 2014-12-04 21:45 ` One Thousand Gnomes
2014-12-04 22:06 ` Paul E. McKenney
1 sibling, 1 reply; 11+ messages in thread
From: One Thousand Gnomes @ 2014-12-04 21:45 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Linus Torvalds, linux-arch@vger.kernel.org, Davidlohr Bueso,
Dmitry Vyukov, Peter Zijlstra, Linux Kernel Mailing List,
Ingo Molnar
> anywhere in that translation unit. After all, any non-static function
> in that translation unit might be called from some other translation
> unit that -did- use locking or whatever.
>
> I will let you know how it goes. ;-)
It breaks DEC10 ;-)
If there is kickback over things like optimisation perhaps the gcc
maintainers could at least consider something like
int __attribute((threadsafe)) fred;
??
Alan
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC] locking: Add volatile to arch_spinlock_t structures
2014-12-04 21:45 ` One Thousand Gnomes
@ 2014-12-04 22:06 ` Paul E. McKenney
0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2014-12-04 22:06 UTC (permalink / raw)
To: One Thousand Gnomes
Cc: Linus Torvalds, linux-arch@vger.kernel.org, Davidlohr Bueso,
Dmitry Vyukov, Peter Zijlstra, Linux Kernel Mailing List,
Ingo Molnar
On Thu, Dec 04, 2014 at 09:45:46PM +0000, One Thousand Gnomes wrote:
> > anywhere in that translation unit. After all, any non-static function
> > in that translation unit might be called from some other translation
> > unit that -did- use locking or whatever.
> >
> > I will let you know how it goes. ;-)
>
> It breaks DEC10 ;-)
To say nothing of CDC 6600 systems lacking the compare-move unit. ;-)
> If there is kickback over things like optimisation perhaps the gcc
> maintainers could at least consider something like
>
> int __attribute((threadsafe)) fred;
>
> ??
Not needed for recent gcc versions -- both the C11 and C++11 standards
require that different threads be permitted to independently access
different variables, where "different" means "no bits of the two variables
residing in the same byte". From what I can see, this would mean that
a conforming pre-EV56 Alpha C11 compiler would need to use LDL_L and
STL_C to carry out 8-bit and 16-bit stores. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2014-12-04 22:08 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-12-04 6:20 [PATCH RFC] locking: Add volatile to arch_spinlock_t structures Paul E. McKenney
[not found] ` <CA+55aFzn-6asfWHB8SAvz4GJWz7uEriujgVLfaqoo_VPtNBLuA@mail.gmail.com>
2014-12-04 6:57 ` Paul E. McKenney
[not found] ` <CA+55aFxGCxQNE4HzRP_Uk2FmFDn_+Hfm6GBz75S+5+SDeODVJQ@mail.gmail.com>
2014-12-04 7:02 ` Paul E. McKenney
2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:02 ` Linus Torvalds
2014-12-04 18:36 ` Paul E. McKenney
2014-12-04 19:18 ` Linus Torvalds
2014-12-04 20:00 ` Paul E. McKenney
2014-12-04 20:12 ` Paul E. McKenney
2014-12-04 21:45 ` One Thousand Gnomes
2014-12-04 22:06 ` Paul E. McKenney
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox