linux-rt-users.vger.kernel.org archive mirror
* [PATCH] seqlock: serialize against writers
@ 2008-08-29 15:44 Gregory Haskins
  2008-08-29 16:09 ` Andi Kleen
                   ` (5 more replies)
  0 siblings, 6 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 15:44 UTC (permalink / raw)
  To: mingo, rostedt, tglx; +Cc: linux-kernel, linux-rt-users, gregory.haskins

*Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*

Hi Ingo, Steven, Thomas,
  Please consider for -rt4.  This fixes a nasty deadlock on my systems under
  heavy load.

-Greg

----

Seqlocks have always advertised that readers do not "block", but this was
never really true.  Readers have always logically blocked at the head of
the critical section under contention with writers, regardless of whether
they were allowed to run code or not.

Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
have turned this into a more explicit blocking operation in mainline.
However, this change highlights a shortcoming in -rt because the
normal seqlock_t variants are preemptible.  This means that we can potentially
deadlock should a reader spin waiting for a write critical-section to end
while the writer is preempted.

This patch changes the internal implementation to use an rwlock and forces
the readers to serialize with the writers under contention.  This has
the advantage that an -rt seqlock_t will put the reader to sleep if deadlock
is imminent, and it will pi-boost the writer to prevent inversion.

This fixes a deadlock discovered under testing where all high priority
readers were hogging the cpus and preventing a writer from releasing the
lock.

Since seqlocks are designed to be used as rarely-written locks, this should
not affect the performance in the fast path.
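
To illustrate the failure mode, here is a minimal sketch (demo_lock and the
update/snapshot helpers are made up for illustration; this is not the actual
code from the test case that hit the deadlock):

#include <linux/seqlock.h>

extern void update_shared_data(void);	/* illustrative helpers only */
extern void snapshot_shared_data(void);

static DEFINE_SEQLOCK(demo_lock);

/* lower-priority writer: the -rt seqlock_t is preemptible, so it can be
 * preempted right here, leaving the sequence count odd */
static void writer_side(void)
{
	write_seqlock(&demo_lock);	/* ++sequence -> odd */
	update_shared_data();		/* <-- preempted by the readers here */
	write_sequnlock(&demo_lock);	/* never reached while readers spin */
}

/* high-priority readers hogging every cpu: the current __read_seqbegin()
 * spins on the odd sequence without blocking or boosting the writer,
 * so the writer never runs again */
static void reader_side(void)
{
	unsigned seq;

	do {
		seq = read_seqbegin(&demo_lock);
		snapshot_shared_data();
	} while (read_seqretry(&demo_lock, seq));
}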

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/seqlock.h |   76 +++++++++++++++++++++++++----------------------
 1 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 345d726..7010169 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,9 +3,11 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. Readers block
+ * on write contention (and where applicable, pi-boost the writer).
+ * Readers without contention on entry acquire the critical section
+ * without any atomic operations, but they may have to retry if a writer
+ * enters before the critical section ends. Writers do not wait for readers.
  *
  * This is not as cache friendly as brlock. Also, this will not work
  * for data that contains pointers, because any writer could
@@ -24,6 +26,8 @@
  *
  * Based on x86_64 vsyscall gettimeofday 
  * by Keith Owens and Andrea Arcangeli
+ *
+ * Priority inheritance and live-lock avoidance by Gregory Haskins
  */
 
 #include <linux/spinlock.h>
@@ -31,12 +35,12 @@
 
 typedef struct {
 	unsigned sequence;
-	spinlock_t lock;
+	rwlock_t lock;
 } __seqlock_t;
 
 typedef struct {
 	unsigned sequence;
-	raw_spinlock_t lock;
+	raw_rwlock_t lock;
 } __raw_seqlock_t;
 
 #define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock)
@@ -54,10 +58,10 @@ typedef __raw_seqlock_t raw_seqlock_t;
  * OK now.  Be cautious.
  */
 #define __RAW_SEQLOCK_UNLOCKED(lockname) \
-		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
+		{ 0, RAW_RW_LOCK_UNLOCKED(lockname) }
 
 #ifdef CONFIG_PREEMPT_RT
-# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
 #else
 # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
 #endif
@@ -66,10 +70,10 @@ typedef __raw_seqlock_t raw_seqlock_t;
 		 __SEQLOCK_UNLOCKED(old_style_seqlock_init)
 
 #define raw_seqlock_init(x) \
-	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
 
 #define seqlock_init(x) \
-		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
 
 #define DEFINE_SEQLOCK(x) \
 		seqlock_t x = __SEQLOCK_UNLOCKED(x)
@@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
  */
 static inline void __write_seqlock(seqlock_t *sl)
 {
-	spin_lock(&sl->lock);
+	write_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
@@ -103,14 +107,14 @@ static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
-	spin_unlock(&sl->lock);
+	write_unlock(&sl->lock);
 }
 
 #define __write_sequnlock_irqrestore(sl, flags)	__write_sequnlock(sl)
 
 static inline int __write_tryseqlock(seqlock_t *sl)
 {
-	int ret = spin_trylock(&sl->lock);
+	int ret = write_trylock(&sl->lock);
 
 	if (ret) {
 		++sl->sequence;
@@ -120,18 +124,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
 {
 	unsigned ret;
 
-repeat:
 	ret = sl->sequence;
 	smp_rmb();
 	if (unlikely(ret & 1)) {
-		cpu_relax();
-		goto repeat;
+		/*
+		 * Serialize with the writer which will ensure they are
+		 * pi-boosted if necessary and prevent us from starving
+		 * them.
+		 */
+		read_lock(&sl->lock);
+		ret = sl->sequence;
+		read_unlock(&sl->lock);
 	}
 
+	BUG_ON(ret & 1);
+
 	return ret;
 }
 
@@ -142,25 +153,13 @@ repeat:
  */
 static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
 {
-	int ret;
-
 	smp_rmb();
-	ret = (sl->sequence != iv);
-	/*
-	 * If invalid then serialize with the writer, to make sure we
-	 * are not livelocking it:
-	 */
-	if (unlikely(ret)) {
-		unsigned long flags;
-		spin_lock_irqsave(&sl->lock, flags);
-		spin_unlock_irqrestore(&sl->lock, flags);
-	}
-	return ret;
+	return (sl->sequence != iv);
 }
 
 static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
 {
-	spin_lock(&sl->lock);
+	write_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
@@ -191,7 +190,7 @@ static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
-	spin_unlock(&sl->lock);
+	write_unlock(&sl->lock);
 }
 
 static __always_inline void
@@ -217,7 +216,7 @@ static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl)
 
 static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
 {
-	int ret = spin_trylock(&sl->lock);
+	int ret = write_trylock(&sl->lock);
 
 	if (ret) {
 		++sl->sequence;
@@ -226,18 +225,23 @@ static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
 	return ret;
 }
 
-static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin_raw(raw_seqlock_t *sl)
 {
 	unsigned ret;
 
-repeat:
 	ret = sl->sequence;
 	smp_rmb();
 	if (unlikely(ret & 1)) {
-		cpu_relax();
-		goto repeat;
+		/*
+		 * Serialize with the writer
+		 */
+		read_lock(&sl->lock);
+		ret = sl->sequence;
+		read_unlock(&sl->lock);
 	}
 
+	BUG_ON(ret & 1);
+
 	return ret;
 }
 



* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
@ 2008-08-29 16:09 ` Andi Kleen
  2008-08-29 16:10   ` Gregory Haskins
  2008-08-29 16:57 ` Stephen Hemminger
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 28+ messages in thread
From: Andi Kleen @ 2008-08-29 16:09 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins

Gregory Haskins <ghaskins@novell.com> writes:

> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
>
> Hi Ingo, Steven, Thomas,
>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>   heavy load.

Does this even work under x86-64? x86-64 uses seqlocks in user space
in its vsyscalls. And read_lock() definitely doesn't work there because 
it writes.
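
Conceptually the reader side has to do something like the sketch below
(a toy lock for illustration, not the real rwlock implementation), i.e.
store to kernel data:

/* toy sketch only -- the real code is more involved and closes the races */
struct toy_rwlock {
	atomic_t readers;	/* lives in kernel memory */
	atomic_t writer;
};

static inline void toy_read_lock(struct toy_rwlock *lock)
{
	atomic_inc(&lock->readers);		/* <-- a store; faults from ring 3 */
	while (atomic_read(&lock->writer))	/* writer active? wait for it */
		cpu_relax();
}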

You would need at least to disable vsyscall gettimeofday(), making
it much much slower.

Perhaps you tested on one of the systems where the vsyscalls need
to fall back for other reasons? (e.g. one using pmtimer for timing).

-Andi


* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:09 ` Andi Kleen
@ 2008-08-29 16:10   ` Gregory Haskins
  2008-08-29 16:22     ` Andi Kleen
  0 siblings, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 16:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users


Andi Kleen wrote:
> Gregory Haskins <ghaskins@novell.com> writes:
>
>   
>> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
>>
>> Hi Ingo, Steven, Thomas,
>>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>>   heavy load.
>>     
>
> Does this even work under x86-64? x86-64 uses seqlocks in user space
> in its vsyscalls. And read_lock() definitely doesn't work there because 
> it writes.
>
> You would need at least to disable vsyscall gettimeofday(), making
> it much much slower.
>
> Perhaps you tested on one of the systems where the vsyscalls need
> to fallback for other reasons? (e.g. one using pmtimer for timing).
>
> -Andi
>   

I'm running it on an x86_64 box as we speak.  How can I tell if there is a
certain mode that is permitting this?

-Greg




* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:10   ` Gregory Haskins
@ 2008-08-29 16:22     ` Andi Kleen
  2008-08-29 16:26       ` Gregory Haskins
  2008-08-29 16:29       ` Gregory Haskins
  0 siblings, 2 replies; 28+ messages in thread
From: Andi Kleen @ 2008-08-29 16:22 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Andi Kleen, Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users

> Im running it on a x86_64 box as we speak.  How can I tell if there is a
> certain mode that is permitting this?

If the boot-up messages say you're running with PMtimer then it uses the fallback
(usually happens on pre-Fam10h AMD boxes). A typical Intel box
would use the faster ring-3-only TSC path and then explode with your
change, I bet.

Or step with gdb through gettimeofday() and see if it does a syscall.

-Andi


* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:22     ` Andi Kleen
@ 2008-08-29 16:26       ` Gregory Haskins
  2008-08-29 16:34         ` Steven Rostedt
  2008-08-29 16:29       ` Gregory Haskins
  1 sibling, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 16:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users


Andi Kleen wrote:
>> Im running it on a x86_64 box as we speak.  How can I tell if there is a
>> certain mode that is permitting this?
>>     
>
> If the boot up says you're running with PMtimer then it uses the fallback
> (usually happens on pre Fam10h AMD boxes). A typical Intel box
> would use the faster ring 3 only TSC path and then explode with your
> change I bet. 
>
> Or step with gdb through gettimeofday() and see if it does a syscall.
>
> -Andi
>   

It seems to be running fine with no indication it has fallen back. 
Perhaps I need a certain workload to bring out the issue?

Here are some details of my system:

-------------------

ghaskins@test:~> uname -a
Linux test 2.6.26.3-rt3-rt #2 SMP PREEMPT RT Fri Aug 29 10:47:17 EDT
2008 x86_64 x86_64 x86_64 GNU/Linux
ghaskins@test:~> cat /proc/cpuinfo
processor    : 0
vendor_id    : GenuineIntel
cpu family    : 6
model        : 15
model name    : Intel(R) Xeon(R) CPU            5130  @ 2.00GHz
stepping    : 6
cpu MHz        : 1995.006
cache size    : 4096 KB
physical id    : 0
siblings    : 2
core id        : 0
cpu cores    : 2
apicid        : 0
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 10
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall
lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx
tm2 ssse3 cx16 xtpr dca lahf_lm
bogomips    : 3992.49
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:

processor    : 1
vendor_id    : GenuineIntel
cpu family    : 6
model        : 15
model name    : Intel(R) Xeon(R) CPU            5130  @ 2.00GHz
stepping    : 6
cpu MHz        : 1995.006
cache size    : 4096 KB
physical id    : 3
siblings    : 2
core id        : 0
cpu cores    : 2
apicid        : 6
initial apicid    : 6
fpu        : yes
fpu_exception    : yes
cpuid level    : 10
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall
lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx
tm2 ssse3 cx16 xtpr dca lahf_lm
bogomips    : 3990.05
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:

processor    : 2
vendor_id    : GenuineIntel
cpu family    : 6
model        : 15
model name    : Intel(R) Xeon(R) CPU            5130  @ 2.00GHz
stepping    : 6
cpu MHz        : 1995.006
cache size    : 4096 KB
physical id    : 0
siblings    : 2
core id        : 1
cpu cores    : 2
apicid        : 1
initial apicid    : 1
fpu        : yes
fpu_exception    : yes
cpuid level    : 10
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall
lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx
tm2 ssse3 cx16 xtpr dca lahf_lm
bogomips    : 3990.08
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:

processor    : 3
vendor_id    : GenuineIntel
cpu family    : 6
model        : 15
model name    : Intel(R) Xeon(R) CPU            5130  @ 2.00GHz
stepping    : 6
cpu MHz        : 1995.006
cache size    : 4096 KB
physical id    : 3
siblings    : 2
core id        : 1
cpu cores    : 2
apicid        : 7
initial apicid    : 7
fpu        : yes
fpu_exception    : yes
cpuid level    : 10
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall
lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx
tm2 ssse3 cx16 xtpr dca lahf_lm
bogomips    : 3990.04
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:

ghaskins@test:~> dmesg | grep -i "pmtimer"
ghaskins@test:~>






* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:22     ` Andi Kleen
  2008-08-29 16:26       ` Gregory Haskins
@ 2008-08-29 16:29       ` Gregory Haskins
  2008-08-29 16:37         ` Andi Kleen
  1 sibling, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 16:29 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users


Andi Kleen wrote:
>> Im running it on a x86_64 box as we speak.  How can I tell if there is a
>> certain mode that is permitting this?
>>     
>
> If the boot up says you're running with PMtimer then it uses the fallback
> (usually happens on pre Fam10h AMD boxes). A typical Intel box
> would use the faster ring 3 only TSC path and then explode with your
> change I bet. 
>   

Thinking about this some more, perhaps the issue is I am not hitting the
contended path in vsyscall?

-Greg





* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:26       ` Gregory Haskins
@ 2008-08-29 16:34         ` Steven Rostedt
  2008-08-29 16:35           ` Gregory Haskins
  0 siblings, 1 reply; 28+ messages in thread
From: Steven Rostedt @ 2008-08-29 16:34 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Andi Kleen, Gregory Haskins, mingo, tglx, linux-kernel,
	linux-rt-users



On Fri, 29 Aug 2008, Gregory Haskins wrote:

> Andi Kleen wrote:
> >> Im running it on a x86_64 box as we speak.  How can I tell if there is a
> >> certain mode that is permitting this?
> >>     
> >
> > If the boot up says you're running with PMtimer then it uses the fallback
> > (usually happens on pre Fam10h AMD boxes). A typical Intel box
> > would use the faster ring 3 only TSC path and then explode with your
> > change I bet. 
> >
> > Or step with gdb through gettimeofday() and see if it does a syscall.
> >
> > -Andi
> >   
> 
> It seems to be running fine with no indication it has fallen back. 
> Perhaps I need a certain workload to bring out the issue?

Perhaps you never hit the slow path in userland. That's the only place it 
would write. Perhaps add a dummy static variable in the fast path, and 
write to it. See if that crashes your apps.
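
Something along these lines in the raw read path would do it (debug hack
only, obviously not for merging):

/* debug hack: force a store to kernel data into the vsyscall read path */
static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
{
	static int poke;	/* lives in kernel memory */
	unsigned ret;

	poke++;			/* should fault as soon as ring 3 runs this */
repeat:
	ret = sl->sequence;
	smp_rmb();
	if (unlikely(ret & 1)) {
		cpu_relax();
		goto repeat;
	}
	return ret;
}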

-- Steve


* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:34         ` Steven Rostedt
@ 2008-08-29 16:35           ` Gregory Haskins
  2008-08-29 16:45             ` Andi Kleen
  2008-08-29 16:58             ` Steven Rostedt
  0 siblings, 2 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 16:35 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andi Kleen, Gregory Haskins, mingo, tglx, linux-kernel,
	linux-rt-users


Steven Rostedt wrote:
> On Fri, 29 Aug 2008, Gregory Haskins wrote:
>
>   
>> Andi Kleen wrote:
>>     
>>>> Im running it on a x86_64 box as we speak.  How can I tell if there is a
>>>> certain mode that is permitting this?
>>>>     
>>>>         
>>> If the boot up says you're running with PMtimer then it uses the fallback
>>> (usually happens on pre Fam10h AMD boxes). A typical Intel box
>>> would use the faster ring 3 only TSC path and then explode with your
>>> change I bet. 
>>>
>>> Or step with gdb through gettimeofday() and see if it does a syscall.
>>>
>>> -Andi
>>>   
>>>       
>> It seems to be running fine with no indication it has fallen back. 
>> Perhaps I need a certain workload to bring out the issue?
>>     
>
> Perhaps you never hit the slow path in userland. That's the only place it 
> would write. Perhaps add a dummy static variable in the fast path, and 
> write to it. See if that crashes you apps.
>
> -- Steve
>   

Yeah, ideas crossed in the mail ;)

I could just force all of the seqbegins to hit the slowpath by hacking
the code and see what happens (aside from slowing down, of course ;)

Question: Which seqlock_t does userspace use?  I assume it uses
seqlock_t and not raw_seqlock_t.  But the only reason that I ask is that
I converted raw_seqlock_t to use the new style as well to be consistent,
even though it is not strictly necessary for the same reasons.  So if
perchance userspace uses the raw variant, I could solve this issue by
only re-working the seqlock_t variant.  Kind of a long shot, but figured
I would mention it :)

-Greg






* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:29       ` Gregory Haskins
@ 2008-08-29 16:37         ` Andi Kleen
  2008-08-29 16:41           ` Gregory Haskins
  0 siblings, 1 reply; 28+ messages in thread
From: Andi Kleen @ 2008-08-29 16:37 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Andi Kleen, Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users

On Fri, Aug 29, 2008 at 12:29:42PM -0400, Gregory Haskins wrote:
> Andi Kleen wrote:
> >> Im running it on a x86_64 box as we speak.  How can I tell if there is a
> >> certain mode that is permitting this?
> >>     
> >
> > If the boot up says you're running with PMtimer then it uses the fallback
> > (usually happens on pre Fam10h AMD boxes). A typical Intel box
> > would use the faster ring 3 only TSC path and then explode with your
> > change I bet. 
> >   
> 
> Thinking about this some more, perhaps the issue is I am not hitting the
> contended path in vsyscall?

Yes, it will only be contended when gettimeofday() races with the timer
interrupt.  You could try to run gettimeofday() in a loop and see how
long it holds up.
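
E.g. a throwaway userspace hammer like this:

/* pound on the gettimeofday() fast path and see if/when it blows up */
#include <sys/time.h>
#include <stdio.h>

int main(void)
{
	struct timeval tv;
	unsigned long i;

	for (i = 0; ; i++) {
		gettimeofday(&tv, NULL);	/* vsyscall path on x86-64 */
		if (!(i & 0xffffff))		/* occasional sign of life */
			printf("%lu: %ld.%06ld\n", i,
			       (long)tv.tv_sec, (long)tv.tv_usec);
	}
	return 0;
}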

But anyways from the theory you should crash when it happens. 
Writes to kernel data are not allowed in vsyscalls and your read_lock clearly 
does a write.

-Andi

-- 
ak@linux.intel.com


* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:37         ` Andi Kleen
@ 2008-08-29 16:41           ` Gregory Haskins
  2008-08-29 17:08             ` Andi Kleen
  0 siblings, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 16:41 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users


Andi Kleen wrote:
> On Fri, Aug 29, 2008 at 12:29:42PM -0400, Gregory Haskins wrote:
>   
>> Andi Kleen wrote:
>>     
>>>> Im running it on a x86_64 box as we speak.  How can I tell if there is a
>>>> certain mode that is permitting this?
>>>>     
>>>>         
>>> If the boot up says you're running with PMtimer then it uses the fallback
>>> (usually happens on pre Fam10h AMD boxes). A typical Intel box
>>> would use the faster ring 3 only TSC path and then explode with your
>>> change I bet. 
>>>   
>>>       
>> Thinking about this some more, perhaps the issue is I am not hitting the
>> contended path in vsyscall?
>>     
>
> Yes it will be only contended when gettimeofday() races with the timer 
> interrupt.  You could try to run gettimeofday() in a loop and see how
> long it holds up.
>
> But anyways from the theory you should crash when it happens. 
> Writes to kernel data are not allowed in vsyscalls and your read_lock clearly 
> does a write.
>   

Oh I don't deny that it does.  The compiler neatly reminded me that it
could no longer be "const".  I just was ignorant of the userspace
requirement ;)

But we *do* have a serious problem here.  A spinning seqlock_t reader
will do bad things if it has preempted the writer, so we need some kind of
solution here, one way or the other.  So suggestions welcome :)  I
realize this is only an issue currently in PREEMPT_RT so perhaps most
will not care... but I do need to solve this at least for this branch.

I currently do not see a way to solve this problem that doesn't involve
some heavier involvement in the read path (read: const seqlock_t need
not apply).  Given that, we either need to address the const requirement
for userspace, alter userspace's usage, or find another way to address
the deadlock.  Any ideas?

-Greg





* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:35           ` Gregory Haskins
@ 2008-08-29 16:45             ` Andi Kleen
  2008-08-29 16:53               ` Steven Rostedt
  2008-08-29 17:00               ` Gregory Haskins
  2008-08-29 16:58             ` Steven Rostedt
  1 sibling, 2 replies; 28+ messages in thread
From: Andi Kleen @ 2008-08-29 16:45 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Steven Rostedt, Andi Kleen, Gregory Haskins, mingo, tglx,
	linux-kernel, linux-rt-users

> I could just force all of the seqbegins to hit the slowpath by hacking
> the code and see what happens (aside from slowing down, of course ;)

Only if you don't believe it will really crash? I think it's pretty
clear even without trying it.

> Question: Which seqlock_t does userspace use?  I assume it uses
> seqlock_t and not raw_seqlock_t. 

> But the only reason that I ask is that
> I converted raw_seqlock_t to use the new style as well to be consistent,

There's no raw_seqlock_t anywhere in mainline?

Anyways the variable is declared (in mainline) in asm-x86/vgtod.h 

> even though it is not strictly necessary for the same reasons.  So if
> perchance userspace uses the raw variant, I could solve this issue by
> only re-working the seqlock_t variant.  Kind of a long shot, but figured
> I would mention it :)

I guess you could define a new seqlock_t which is explicitly user-space
safe. That might avoid such issues in the future. But then
that would likely require some code duplication and be ugly.

On the other hand, whatever problem you are fixing in the kernel
(to be honest it's still unclear to me what the problem is)
likely needs to be fixed for the userland lock too.

-Andi



* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:45             ` Andi Kleen
@ 2008-08-29 16:53               ` Steven Rostedt
  2008-08-29 17:00                 ` Gregory Haskins
  2008-08-29 17:00               ` Gregory Haskins
  1 sibling, 1 reply; 28+ messages in thread
From: Steven Rostedt @ 2008-08-29 16:53 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, Gregory Haskins, mingo, tglx, linux-kernel,
	linux-rt-users


The subject forgot to add "RT" in the brackets.

On Fri, 29 Aug 2008, Andi Kleen wrote:

> > I could just force all of the seqbegins to hit the slowpath by hacking
> > the code and see what happens (aside from slowing down, of course ;)
> 
> Only if you don't believe it will really crash? I think it's pretty
> clear even without trying it.
> 
> > Question: Which seqlock_t does userspace use?  I assume it uses
> > seqlock_t and not raw_seqlock_t. 
> 
> > But the only reason that I ask is that
> > I converted raw_seqlock_t to use the new style as well to be consistent,
> 
> There's no raw_seqlock_t anywhere in mainline?

Nope, raw_seqlock_t in -rt is equivalent to seqlock_t in mainline.

> 
> Anyways the variable is declared (in mainline) in asm-x86/vgtod.h 
> 
> > even though it is not strictly necessary for the same reasons.  So if
> > perchance userspace uses the raw variant, I could solve this issue by
> > only re-working the seqlock_t variant.  Kind of a long shot, but figured
> > I would mention it :)
> 
> I guess you could define a new seqlock_t which is explicitely user space
> safe. That might avoid such issues in the future. But then
> that would likely require some code duplication and be ugly.
> 
> On the other hand whatever problem you fixing in the kernel
> (to be honest it's still unclear to me what the problem is)
> needs to be likely fixed for the userland lock too.

I'm not convinced that the raw_seqlocks (mainline normal seqlocks) have a
problem anyway.

-- Steve



* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
  2008-08-29 16:09 ` Andi Kleen
@ 2008-08-29 16:57 ` Stephen Hemminger
  2008-08-29 17:02   ` [ RT PATCH] " Steven Rostedt
  2008-08-29 18:03 ` [RT PATCH v2] " Gregory Haskins
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2008-08-29 16:57 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins

On Fri, 29 Aug 2008 11:44:58 -0400
Gregory Haskins <ghaskins@novell.com> wrote:

> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
> 
> Hi Ingo, Steven, Thomas,
>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>   heavy load.
> 
> -Greg
> 
> ----
> 
> Seqlocks have always advertised that readers do not "block", but this was
> never really true.  Readers have always logically blocked at the head of
> the critical section under contention with writers, regardless of whether
> they were allowed to run code or not.
> 
> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
> have turned this into a more explicit blocking operation in mainline.
> However, this change highlights a short-coming in -rt because the
> normal seqlock_ts are preemptible.  This means that we can potentially
> deadlock should a reader spin waiting for a write critical-section to end
> while the writer is preempted.
> 
> This patch changes the internal implementation to use a rwlock and forces
> the readers to serialize with the writers under contention.  This will
> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
> were imminent, and it will pi-boost the writer to prevent inversion.
> 
> This fixes a deadlock discovered under testing where all high prioritiy
> readers were hogging the cpus and preventing a writer from releasing the
> lock.
> 
> Since seqlocks are designed to be used as rarely-write locks, this should
> not affect the performance in the fast-path
> 
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
> 
>  include/linux/seqlock.h |   76 +++++++++++++++++++++++++----------------------
>  1 files changed, 40 insertions(+), 36 deletions(-)
> 
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index 345d726..7010169 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -3,9 +3,11 @@
>  /*
>   * Reader/writer consistent mechanism without starving writers. This type of
>   * lock for data where the reader wants a consistent set of information
> - * and is willing to retry if the information changes.  Readers never
> - * block but they may have to retry if a writer is in
> - * progress. Writers do not wait for readers. 
> + * and is willing to retry if the information changes. Readers block
> + * on write contention (and where applicable, pi-boost the writer).
> + * Readers without contention on entry acquire the critical section
> + * without any atomic operations, but they may have to retry if a writer
> + * enters before the critical section ends. Writers do not wait for readers.
>   *
>   * This is not as cache friendly as brlock. Also, this will not work
>   * for data that contains pointers, because any writer could
> @@ -24,6 +26,8 @@
>   *
>   * Based on x86_64 vsyscall gettimeofday 
>   * by Keith Owens and Andrea Arcangeli
> + *
> + * Priority inheritance and live-lock avoidance by Gregory Haskins
>   */
>  
>  #include <linux/spinlock.h>
> @@ -31,12 +35,12 @@
>  
>  typedef struct {
>  	unsigned sequence;
> -	spinlock_t lock;
> +	rwlock_t lock;
>  } __seqlock_t;
>  
>  typedef struct {
>  	unsigned sequence;
> -	raw_spinlock_t lock;
> +	raw_rwlock_t lock;
>  } __raw_seqlock_t;
>  
>  #define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock)
> @@ -54,10 +58,10 @@ typedef __raw_seqlock_t raw_seqlock_t;
>   * OK now.  Be cautious.
>   */
>  #define __RAW_SEQLOCK_UNLOCKED(lockname) \
> -		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
> +		{ 0, RAW_RW_LOCK_UNLOCKED(lockname) }
>  
>  #ifdef CONFIG_PREEMPT_RT
> -# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
> +# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
>  #else
>  # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
>  #endif
> @@ -66,10 +70,10 @@ typedef __raw_seqlock_t raw_seqlock_t;
>  		 __SEQLOCK_UNLOCKED(old_style_seqlock_init)
>  
>  #define raw_seqlock_init(x) \
> -	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
> +	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
>  
>  #define seqlock_init(x) \
> -		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
> +		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
>  
>  #define DEFINE_SEQLOCK(x) \
>  		seqlock_t x = __SEQLOCK_UNLOCKED(x)
> @@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
>   */
>  static inline void __write_seqlock(seqlock_t *sl)
>  {
> -	spin_lock(&sl->lock);
> +	write_lock(&sl->lock);
>  	++sl->sequence;
>  	smp_wmb();
>  }
> @@ -103,14 +107,14 @@ static inline void __write_sequnlock(seqlock_t *sl)
>  {
>  	smp_wmb();
>  	sl->sequence++;
> -	spin_unlock(&sl->lock);
> +	write_unlock(&sl->lock);
>  }
>  
>  #define __write_sequnlock_irqrestore(sl, flags)	__write_sequnlock(sl)
>  
>  static inline int __write_tryseqlock(seqlock_t *sl)
>  {
> -	int ret = spin_trylock(&sl->lock);
> +	int ret = write_trylock(&sl->lock);
>  
>  	if (ret) {
>  		++sl->sequence;
> @@ -120,18 +124,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
>  }
>  
>  /* Start of read calculation -- fetch last complete writer token */
> -static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
> +static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
>  {
>  	unsigned ret;
>  
> -repeat:
>  	ret = sl->sequence;
>  	smp_rmb();
>  	if (unlikely(ret & 1)) {
> -		cpu_relax();
> -		goto repeat;
> +		/*
> +		 * Serialze with the writer which will ensure they are
> +		 * pi-boosted if necessary and prevent us from starving
> +		 * them.
> +		 */
> +		read_lock(&sl->lock);
> +		ret = sl->sequence;
> +		read_unlock(&sl->lock);
>  	}
>  
> +	BUG_ON(ret & 1);
> +
>  	return ret;
>  }
>  
> @@ -142,25 +153,13 @@ repeat:
>   */
>  static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
>  {
> -	int ret;
> -
>  	smp_rmb();
> -	ret = (sl->sequence != iv);
> -	/*
> -	 * If invalid then serialize with the writer, to make sure we
> -	 * are not livelocking it:
> -	 */
> -	if (unlikely(ret)) {
> -		unsigned long flags;
> -		spin_lock_irqsave(&sl->lock, flags);
> -		spin_unlock_irqrestore(&sl->lock, flags);
> -	}
> -	return ret;
> +	return (sl->sequence != iv);
>  }
>  
>  static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
>  {
> -	spin_lock(&sl->lock);
> +	write_lock(&sl->lock);
>  	++sl->sequence;
>  	smp_wmb();
>  }
> @@ -191,7 +190,7 @@ static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl)
>  {
>  	smp_wmb();
>  	sl->sequence++;
> -	spin_unlock(&sl->lock);
> +	write_unlock(&sl->lock);
>  }
>  
>  static __always_inline void
> @@ -217,7 +216,7 @@ static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl)
>  
>  static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
>  {
> -	int ret = spin_trylock(&sl->lock);
> +	int ret = write_trylock(&sl->lock);
>  
>  	if (ret) {
>  		++sl->sequence;
> @@ -226,18 +225,23 @@ static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
>  	return ret;
>  }
>  
> -static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
> +static __always_inline unsigned __read_seqbegin_raw(raw_seqlock_t *sl)
>  {
>  	unsigned ret;
>  
> -repeat:
>  	ret = sl->sequence;
>  	smp_rmb();
>  	if (unlikely(ret & 1)) {
> -		cpu_relax();
> -		goto repeat;
> +		/*
> +		 * Serialize with the writer
> +		 */
> +		read_lock(&sl->lock);
> +		ret = sl->sequence;
> +		read_unlock(&sl->lock);
>  	}
>  
> +	BUG_ON(ret & 1);
> +
>  	return ret;
>  }
>  

I have mixed feelings about this. The point of this primitive was for
cases where write contention was rare and writers only did small updates.
So what you did was fix the primitive for cases where it is being misused.
Do you have a real example where this is a problem? If so then the
user of seqlock should be fixed, rather than fixing seqlock.


* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:35           ` Gregory Haskins
  2008-08-29 16:45             ` Andi Kleen
@ 2008-08-29 16:58             ` Steven Rostedt
  1 sibling, 0 replies; 28+ messages in thread
From: Steven Rostedt @ 2008-08-29 16:58 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Andi Kleen, Gregory Haskins, mingo, tglx, linux-kernel,
	linux-rt-users


On Fri, 29 Aug 2008, Gregory Haskins wrote:
> 
> Yeah, ideas crossed in the mail ;)
> 
> I could just force all of the seqbegins to hit the slowpath by hacking
> the code and see what happens (aside from slowing down, of course ;)
> 
> Question: Which seqlock_t does userspace use?  I assume it uses
> seqlock_t and not raw_seqlock_t.  But the only reason that I ask is that
> I converted raw_seqlock_t to use the new style as well to be consistent,
> even though it is not strictly necessary for the same reasons.  So if
> perchance userspace uses the raw variant, I could solve this issue by
> only re-working the seqlock_t variant.  Kind of a long shot, but figured
> I would mention it :)

I answered this on IRC, but this is for the rest of those reading this 
thread.

Userspace (vsyscalls) can only use raw_seqlock_t. And only the read 
version for that matter. Since the read of raw_seqlock_t is just that, a 
read, no writes, and no jumping to other functions on contention.

The vsyscalls should never use the -rt seqlock_t. Not modifying the raws 
here should make us golden.
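
i.e. the userspace read side only ever has the shape of this sketch (the
gtod structure and field names below are made up, not the real vgtod ones):

/* sketch: a vsyscall-style reader does nothing but loads and barriers */
struct gtod_data {
	unsigned sequence;
	long wall_sec;
	long wall_usec;
};

static void gtod_read_sketch(const struct gtod_data *gtod, long *sec, long *usec)
{
	unsigned seq;

	do {
		seq = gtod->sequence;	/* plain load, no store */
		smp_rmb();
		*sec  = gtod->wall_sec;	/* copy out the protected data */
		*usec = gtod->wall_usec;
		smp_rmb();
	} while (gtod->sequence != seq || (seq & 1));
}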

-- Steve



* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:45             ` Andi Kleen
  2008-08-29 16:53               ` Steven Rostedt
@ 2008-08-29 17:00               ` Gregory Haskins
  1 sibling, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 17:00 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gregory Haskins, Steven Rostedt, mingo, tglx, linux-kernel,
	linux-rt-users


Andi Kleen wrote:
>> I could just force all of the seqbegins to hit the slowpath by hacking
>> the code and see what happens (aside from slowing down, of course ;)
>>     
>
> Only if you don't believe it will really crash? I think it's pretty
> clear even without trying it.
>   

Well, I guess it was just to prove to myself that I broke something
because I don't understand how the vsyscall interface works.  But given
your expertise here, I have no problem with just taking your word for it.


>   
>> Question: Which seqlock_t does userspace use?  I assume it uses
>> seqlock_t and not raw_seqlock_t. 
>>     
>
>   
>> But the only reason that I ask is that
>> I converted raw_seqlock_t to use the new style as well to be consistent,
>>     
>
> There's no raw_seqlock_t anywhere in mainline?
>   
Yeah, understood.  Both exist in -rt, and I was just saying that we
technically only need the seqlock_t fix in -rt.  So if raw_seqlock_t
could be left pristine and solve this problem, that is an acceptable
compromise to me.


> Anyways the variable is declared (in mainline) in asm-x86/vgtod.h 
>
>   
>> even though it is not strictly necessary for the same reasons.  So if
>> perchance userspace uses the raw variant, I could solve this issue by
>> only re-working the seqlock_t variant.  Kind of a long shot, but figured
>> I would mention it :)
>>     
>
> I guess you could define a new seqlock_t which is explicitely user space
> safe. That might avoid such issues in the future. But then
> that would likely require some code duplication and be ugly.
>
> On the other hand whatever problem you fixing in the kernel
> (to be honest it's still unclear to me what the problem is)
> needs to be likely fixed for the userland lock too.
>   

Yeah, it would possibly be a problem in both cases.

The problem I am addressing only exists in -rt since it has seqlock_t
and raw_seqlock_t (with the former using preemptible spinlock_t's). 
Since the underlying seqlock_t->spinlock_t is preemptible, you can see
what happens when one thread does:

{
    write_seqlock();
    /* asl */
    write_sequnlock();
}

while other high-prio threads do

do { read_seqbegin(); /* asl */; } while (read_seqretry());

The readers could preempt the writer mid critical section and enter a
live-locked loop.

raw_seqlock_t (which is equivalent to a mainline seqlock_t) does not have
this problem because the spinlock acquisition inside the write_seqlock
disables preemption.
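
For reference, the two write paths as they stand today are textually
identical, but under PREEMPT_RT the seqlock_t one takes a sleeping,
preemptible lock:

/* -rt seqlock_t: spin_lock() here is an rtmutex-based, preemptible lock,
 * so the writer can be preempted with the sequence left odd */
static inline void __write_seqlock(seqlock_t *sl)
{
	spin_lock(&sl->lock);
	++sl->sequence;
	smp_wmb();
}

/* raw variant: a true spinning lock with preemption disabled, so a reader
 * can never run on this cpu in the middle of the update */
static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
{
	spin_lock(&sl->lock);
	++sl->sequence;
	smp_wmb();
}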

HTH

-Greg





* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:53               ` Steven Rostedt
@ 2008-08-29 17:00                 ` Gregory Haskins
  0 siblings, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 17:00 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andi Kleen, Gregory Haskins, mingo, tglx, linux-kernel,
	linux-rt-users


Steven Rostedt wrote:
> The subject forgot to add "RT" in the brackets.
>
> On Fri, 29 Aug 2008, Andi Kleen wrote:
>
>   
>>> I could just force all of the seqbegins to hit the slowpath by hacking
>>> the code and see what happens (aside from slowing down, of course ;)
>>>       
>> Only if you don't believe it will really crash? I think it's pretty
>> clear even without trying it.
>>
>>     
>>> Question: Which seqlock_t does userspace use?  I assume it uses
>>> seqlock_t and not raw_seqlock_t. 
>>>       
>>> But the only reason that I ask is that
>>> I converted raw_seqlock_t to use the new style as well to be consistent,
>>>       
>> There's no raw_seqlock_t anywhere in mainline?
>>     
>
> Nope, raw_seqlock_t in -rt is equivelant to seqlock_t in mainline.
>
>   
>> Anyways the variable is declared (in mainline) in asm-x86/vgtod.h 
>>
>>     
>>> even though it is not strictly necessary for the same reasons.  So if
>>> perchance userspace uses the raw variant, I could solve this issue by
>>> only re-working the seqlock_t variant.  Kind of a long shot, but figured
>>> I would mention it :)
>>>       
>> I guess you could define a new seqlock_t which is explicitely user space
>> safe. That might avoid such issues in the future. But then
>> that would likely require some code duplication and be ugly.
>>
>> On the other hand whatever problem you fixing in the kernel
>> (to be honest it's still unclear to me what the problem is)
>> needs to be likely fixed for the userland lock too.
>>     
>
> I'm not convinced that the raw_seqlocks (mainline normal seqlocks) has a 
> problem anyway.
>   
(continuing from IRC)

Agreed.   I converted them to be consistent.  Steve just told me that
userspace actually uses the raw_seqlock_t variant, so the answer is
simple.  Just leave raw_seqlock_t alone and the patch will work fine.

Thoughts?

-Greg





* Re: [ RT PATCH] seqlock: serialize against writers
  2008-08-29 16:57 ` Stephen Hemminger
@ 2008-08-29 17:02   ` Steven Rostedt
  0 siblings, 0 replies; 28+ messages in thread
From: Steven Rostedt @ 2008-08-29 17:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Gregory Haskins, mingo, tglx, linux-kernel, linux-rt-users,
	gregory.haskins


/me adds "RT" to subject.

On Fri, 29 Aug 2008, Stephen Hemminger wrote:
> 
> I have mixed feelings about this. The point of this primitive was for
> cases where write contention was rare and writers only did small updates.
> So what you did was fix the primitive for cases where it is being misused.
> Do you have a real example where this is a problem? If so then the
> user of seqlock should be fixed, rather than fixing seqlock.
> 


I think there's some confusion here, because of the changes to raw. The 
raw_seqlock_t and mainline seqlock_t have no issue, and should not be
touched.

What Gregory is solving is an issue in -rt where we let write seqlocks be 
preempted.  Thus this causes issues with determinism. This is a special 
case for -rt, and -rt only. It should not affect mainline in any way.

Sorry for the noise.

-- Steve



* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 16:41           ` Gregory Haskins
@ 2008-08-29 17:08             ` Andi Kleen
  0 siblings, 0 replies; 28+ messages in thread
From: Andi Kleen @ 2008-08-29 17:08 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Andi Kleen, Gregory Haskins, mingo, rostedt, tglx, linux-kernel,
	linux-rt-users


No idea on the problem itself because I don't understand it, just:

> Given that, we either need to address the const requirement
> for userspace, alter userspace's usage, or find another way to address
> the deadlock.  Any ideas?

In the worst case you can always disable the vsyscall gtod() and
thus the need for a user space safe seqlock_t, but it will
hurt some workloads significantly. 

-Andi



* [RT PATCH v2] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
  2008-08-29 16:09 ` Andi Kleen
  2008-08-29 16:57 ` Stephen Hemminger
@ 2008-08-29 18:03 ` Gregory Haskins
  2008-08-29 18:12   ` Gregory Haskins
  2008-08-30 11:17   ` Peter Zijlstra
  2008-08-30 11:08 ` [PATCH] " Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 18:03 UTC (permalink / raw)
  To: mingo, rostedt, tglx
  Cc: linux-kernel, linux-rt-users, gregory.haskins, andi, shemminger

*Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*

Hi Ingo, Steven, Thomas,
  Please consider for -rt4.  This fixes a nasty deadlock on my systems under
  heavy load.

[
Changelog:
	v2: only touch seqlock_t because raw_seqlock_t doesn't require
	    serialization and userspace cannot modify data during a read

	v1: initial release
]

-Greg

----
seqlock: serialize against writers

Seqlocks have always advertised that readers do not "block", but this was
never really true.  Readers have always logically blocked at the head of
the critical section under contention with writers, regardless of whether
they were allowed to run code or not.

Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
have turned this into a more explicit blocking operation in mainline.
However, this change highlights a shortcoming in -rt because the
normal seqlock_t variants are preemptible.  This means that we can potentially
deadlock should a reader spin waiting for a write critical-section to end
while the writer is preempted.

This patch changes the internal implementation to use an rwlock and forces
the readers to serialize with the writers under contention.  This has
the advantage that an -rt seqlock_t will put the reader to sleep if deadlock
is imminent, and it will pi-boost the writer to prevent inversion.

This fixes a deadlock discovered under testing where all high priority
readers were hogging the cpus and preventing a writer from releasing the
lock.

Since seqlocks are designed to be used as rarely-written locks, this should
not affect the performance in the fast path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/seqlock.h |   51 +++++++++++++++++++++++------------------------
 1 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 345d726..605fcdb 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,9 +3,11 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. Readers block
+ * on write contention (and where applicable, pi-boost the writer).
+ * Readers without contention on entry acquire the critical section
+ * without any atomic operations, but they may have to retry if a writer
+ * enters before the critical section ends. Writers do not wait for readers.
  *
  * This is not as cache friendly as brlock. Also, this will not work
  * for data that contains pointers, because any writer could
@@ -24,6 +26,8 @@
  *
  * Based on x86_64 vsyscall gettimeofday 
  * by Keith Owens and Andrea Arcangeli
+ *
+ * Priority inheritance and live-lock avoidance by Gregory Haskins
  */
 
 #include <linux/spinlock.h>
@@ -31,7 +35,7 @@
 
 typedef struct {
 	unsigned sequence;
-	spinlock_t lock;
+	rwlock_t lock;
 } __seqlock_t;
 
 typedef struct {
@@ -57,7 +61,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
 
 #ifdef CONFIG_PREEMPT_RT
-# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
 #else
 # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
 #endif
@@ -69,7 +73,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
 
 #define seqlock_init(x) \
-		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
 
 #define DEFINE_SEQLOCK(x) \
 		seqlock_t x = __SEQLOCK_UNLOCKED(x)
@@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
  */
 static inline void __write_seqlock(seqlock_t *sl)
 {
-	spin_lock(&sl->lock);
+	write_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
@@ -103,14 +107,14 @@ static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
-	spin_unlock(&sl->lock);
+	write_unlock(&sl->lock);
 }
 
 #define __write_sequnlock_irqrestore(sl, flags)	__write_sequnlock(sl)
 
 static inline int __write_tryseqlock(seqlock_t *sl)
 {
-	int ret = spin_trylock(&sl->lock);
+	int ret = write_trylock(&sl->lock);
 
 	if (ret) {
 		++sl->sequence;
@@ -120,18 +124,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
 {
 	unsigned ret;
 
-repeat:
 	ret = sl->sequence;
 	smp_rmb();
 	if (unlikely(ret & 1)) {
-		cpu_relax();
-		goto repeat;
+		/*
+		 * Serialize with the writer which will ensure they are
+		 * pi-boosted if necessary and prevent us from starving
+		 * them.
+		 */
+		read_lock(&sl->lock);
+		ret = sl->sequence;
+		read_unlock(&sl->lock);
 	}
 
+	BUG_ON(ret & 1);
+
 	return ret;
 }
 
@@ -142,20 +153,8 @@ repeat:
  */
 static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
 {
-	int ret;


* Re: [RT PATCH v2] seqlock: serialize against writers
  2008-08-29 18:03 ` [RT PATCH v2] " Gregory Haskins
@ 2008-08-29 18:12   ` Gregory Haskins
  2008-08-30 11:17   ` Peter Zijlstra
  1 sibling, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-29 18:12 UTC (permalink / raw)
  To: andi; +Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users, shemminger


Gregory Haskins wrote:
> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
>
> Hi Ingo, Steven, Thomas,
>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>   heavy load.
>
> [
> Changelog:
> 	v2: only touch seqlock_t because raw_seqlock_t doesn't require
> 	    serialization and userspace cannot modify data during a read
>
> 	v1: initial release
> ]
>   

Hi Andi,
  As it turns out, my distcc backend was an x86_64 machine running the
v1 patch and I started to notice sometime today that certain cc1 jobs
were sometimes (albeit rarely) segfaulting on me.  I noticed that before
I even published the first patch, but I chalked it up to a corrupt .o on
my NFS home.  Plus I was forgetting that the distcc machine was running
the patch, and I would probably have never made the userspace connection
had you not mentioned it.  In any case, I rebuilt with this v2 patch
applied, and the segfaults have gone away.  So I think we know
several things:

1) You were right that this would cause an issue if the slow path is hit.
2) Steven was right that userspace must use raw_seqlock_t because it no
longer crashes with v2.
3) I am satisfied that my primary concern is still properly addressed.

Hopefully everyone is satisfied with this patch now.

Thanks for the help.  It's much appreciated!

-Greg





* Re: [PATCH] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
                   ` (2 preceding siblings ...)
  2008-08-29 18:03 ` [RT PATCH v2] " Gregory Haskins
@ 2008-08-30 11:08 ` Peter Zijlstra
  2008-09-02 12:45 ` [RT PATCH v3] " Gregory Haskins
  2008-09-02 13:29 ` [RT PATCH v4] " Gregory Haskins
  5 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2008-08-30 11:08 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins

On Fri, 2008-08-29 at 11:44 -0400, Gregory Haskins wrote:
> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
> 
> Hi Ingo, Steven, Thomas,
>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>   heavy load.
> 
> -Greg
> 
> ----
> 
> Seqlocks have always advertised that readers do not "block", but this was
> never really true.  Readers have always logically blocked at the head of
> the critical section under contention with writers, regardless of whether
> they were allowed to run code or not.
> 
> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
> have turned this into a more explicit blocking operation in mainline.
> However, this change highlights a short-coming in -rt because the
> normal seqlock_ts are preemptible.  This means that we can potentially
> deadlock should a reader spin waiting for a write critical-section to end
> while the writer is preempted.

I think the technical term is livelock.

So the problem is that the write side gets preempted, and the read side
spins in a non-preemptive fashion?

Looking at the code, __read_seqbegin() doesn't disable preemption, so
even though it is highly inefficient when spinning against a preempted lock, it
shouldn't livelock, since the spinner can get preempted, giving the
writer a chance to finish.

> This patch changes the internal implementation to use a rwlock and forces
> the readers to serialize with the writers under contention.  This will
> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
> were imminent, and it will pi-boost the writer to prevent inversion.
> 
> This fixes a deadlock discovered under testing where all high prioritiy
> readers were hogging the cpus and preventing a writer from releasing the
> lock.
> 
> Since seqlocks are designed to be used as rarely-write locks, this should
> not affect the performance in the fast-path

Not quite; seqlocks never suffered the cacheline bounce rwlocks have -
which was their strongest point - so I very much do not like this change.
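
Compare the two reader fast paths (sketch):

/* seqlock read side: a plain load, the cacheline can stay shared */
static unsigned seq_read_sketch(const seqlock_t *sl)
{
	unsigned seq = sl->sequence;
	smp_rmb();
	return seq;
}

/* rwlock read side: an atomic read-modify-write of the lock word, so the
 * cacheline bounces between all the reading cpus */
static void rwlock_read_sketch(rwlock_t *lock)
{
	read_lock(lock);
	read_unlock(lock);
}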

As to the x86_64 gtod-vsyscall, that uses a raw_seqlock_t on -rt, which
is still non-preemptable and should thus not be affected by this
livelock scenario.



* Re: [RT PATCH v2] seqlock: serialize against writers
  2008-08-29 18:03 ` [RT PATCH v2] " Gregory Haskins
  2008-08-29 18:12   ` Gregory Haskins
@ 2008-08-30 11:17   ` Peter Zijlstra
  2008-08-30 12:32     ` Gregory Haskins
  1 sibling, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2008-08-30 11:17 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins, andi, shemminger

On Fri, 2008-08-29 at 14:03 -0400, Gregory Haskins wrote:
> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
> 
> Hi Ingo, Steven, Thomas,
>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>   heavy load.
> 
> [
> Changelog:
> 	v2: only touch seqlock_t because raw_seqlock_t doesn't require
> 	    serialization and userspace cannot modify data during a read
> 
> 	v1: initial release
> ]
> 
> -Greg
> 
> ----
> seqlock: serialize against writers
> 
> Seqlocks have always advertised that readers do not "block", but this was
> never really true.  Readers have always logically blocked at the head of
> the critical section under contention with writers, regardless of whether
> they were allowed to run code or not.
> 
> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
> have turned this into a more explicit blocking operation in mainline.
> However, this change highlights a short-coming in -rt because the
> normal seqlock_ts are preemptible.  This means that we can potentially
> deadlock should a reader spin waiting for a write critical-section to end
> while the writer is preempted.

Ah, the point I was missing is a higher-priority realtime task, in which
case the write side will never run because it won't preempt the spinning
reader.

> This patch changes the internal implementation to use a rwlock and forces
> the readers to serialize with the writers under contention.  This will
> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
> were imminent, and it will pi-boost the writer to prevent inversion.
> 
> This fixes a deadlock discovered under testing where all high prioritiy
> readers were hogging the cpus and preventing a writer from releasing the
> lock.
> 
> Since seqlocks are designed to be used as rarely-write locks, this should
> not affect the performance in the fast-path

Still don't like this patch; once you have a rwlock you might as well go
all the way, especially since this half-arsed construct defeats PI in
certain cases.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RT PATCH v2] seqlock: serialize against writers
  2008-08-30 11:17   ` Peter Zijlstra
@ 2008-08-30 12:32     ` Gregory Haskins
  2008-08-30 12:38       ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-08-30 12:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins, andi, shemminger


Peter Zijlstra wrote:
> On Fri, 2008-08-29 at 14:03 -0400, Gregory Haskins wrote:
>   
>> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
>>
>> Hi Ingo, Steven, Thomas,
>>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>>   heavy load.
>>
>> [
>> Changelog:
>> 	v2: only touch seqlock_t because raw_seqlock_t doesn't require
>> 	    serialization and userspace cannot modify data during a read
>>
>> 	v1: initial release
>> ]
>>
>> -Greg
>>
>> ----
>> seqlock: serialize against writers
>>
>> Seqlocks have always advertised that readers do not "block", but this was
>> never really true.  Readers have always logically blocked at the head of
>> the critical section under contention with writers, regardless of whether
>> they were allowed to run code or not.
>>
>> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
>> have turned this into a more explicit blocking operation in mainline.
>> However, this change highlights a short-coming in -rt because the
>> normal seqlock_ts are preemptible.  This means that we can potentially
>> deadlock should a reader spin waiting for a write critical-section to end
>> while the writer is preempted.
>>     
>
> Ah, the point I was missing is higher-priority realtime task, in which
> case the write side will never run because it wont preempt.
>   

Yep
>   
>> This patch changes the internal implementation to use a rwlock and forces
>> the readers to serialize with the writers under contention.  This will
>> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
>> were imminent, and it will pi-boost the writer to prevent inversion.
>>
>> This fixes a deadlock discovered under testing where all high prioritiy
>> readers were hogging the cpus and preventing a writer from releasing the
>> lock.
>>
>> Since seqlocks are designed to be used as rarely-write locks, this should
>> not affect the performance in the fast-path
>>     
>
> Still dont like this patch, once you have a rwlock you might as well go
> all the way.

Why?  A full rwlock will still be much slower since the readers will
always need an atomic op.  This construct only uses atomic ops in the
slow path under contention, which should be rare, and is thus still
superior when the design permits retries.

>  Esp since this half-arsed construct defeats PI in certain
> cases.
>   

Ouch.  While I admit that you can still get into inversion scenarios
once the reader leaves the seqbegin, that is the nature of seqlocks.
The only ways I can think of to get around this involve atomic ops in
the fast path, which I think should be avoided.  What would you suggest
instead?
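
To make that window concrete, here is a minimal sketch of a typical reader
(hypothetical names; demo_data and demo_read are made up, the seqlock API
usage is the ordinary begin/retry pattern): between read_seqbegin() and
read_seqretry() the reader holds no lock, so a writer that enters during
that window gets no pi-boost from this reader; the reader simply retries
once the writer is done:

struct demo_data {
	seqlock_t lock;
	u64 value;
};

static u64 demo_read(struct demo_data *d)
{
	unsigned seq;
	u64 v;

	do {
		seq = read_seqbegin(&d->lock);	/* serializes/boosts only if a writer is already in */
		v = d->value;			/* no lock held here, so a new writer sees no PI chain */
	} while (read_seqretry(&d->lock, seq));	/* detect that writer and retry */

	return v;
}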

-Greg







^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RT PATCH v2] seqlock: serialize against writers
  2008-08-30 12:32     ` Gregory Haskins
@ 2008-08-30 12:38       ` Peter Zijlstra
  2008-08-30 13:05         ` Gregory Haskins
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2008-08-30 12:38 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins, andi, shemminger

On Sat, 2008-08-30 at 08:32 -0400, Gregory Haskins wrote:
> Peter Zijlstra wrote:
> > On Fri, 2008-08-29 at 14:03 -0400, Gregory Haskins wrote:
> >   
> >> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
> >>
> >> Hi Ingo, Steven, Thomas,
> >>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
> >>   heavy load.
> >>
> >> [
> >> Changelog:
> >> 	v2: only touch seqlock_t because raw_seqlock_t doesn't require
> >> 	    serialization and userspace cannot modify data during a read
> >>
> >> 	v1: initial release
> >> ]
> >>
> >> -Greg
> >>
> >> ----
> >> seqlock: serialize against writers
> >>
> >> Seqlocks have always advertised that readers do not "block", but this was
> >> never really true.  Readers have always logically blocked at the head of
> >> the critical section under contention with writers, regardless of whether
> >> they were allowed to run code or not.
> >>
> >> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
> >> have turned this into a more explicit blocking operation in mainline.
> >> However, this change highlights a short-coming in -rt because the
> >> normal seqlock_ts are preemptible.  This means that we can potentially
> >> deadlock should a reader spin waiting for a write critical-section to end
> >> while the writer is preempted.
> >>     
> >
> > Ah, the point I was missing is higher-priority realtime task, in which
> > case the write side will never run because it wont preempt.
> >   
> 
> Yep
> >   
> >> This patch changes the internal implementation to use a rwlock and forces
> >> the readers to serialize with the writers under contention.  This will
> >> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
> >> were imminent, and it will pi-boost the writer to prevent inversion.
> >>
> >> This fixes a deadlock discovered under testing where all high prioritiy
> >> readers were hogging the cpus and preventing a writer from releasing the
> >> lock.
> >>
> >> Since seqlocks are designed to be used as rarely-write locks, this should
> >> not affect the performance in the fast-path
> >>     
> >
> > Still dont like this patch, once you have a rwlock you might as well go
> > all the way.
> 
> Why?  

Because of the second point.

> A full rwlock will still be much slower since the readers will
> always need an atomic op.  This construct only uses atomic ops in the
> slow path under contention, which should be rare, and is thus still
> superior when retries are permissible to the design.
> 
> >  Esp since this half-arsed construct defeats PI in certain
> > cases.
> >   
> 
> Ouch.  While I admit that you can still get into inversion scenarios
> once the reader leaves the seqbegin, this is the nature of seqlocks. 
> The only ways I can think of to get around this involve atomic ops in
> the fast path, which I think should be avoided.  What would you suggest
> otherwise?

Since we're talking -rt here, determinism rules, so bite the bullet and
do full PI.

The only reason we made all that stuff preemptable is to gain
determinism; that also means we have to do the PI thing.




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RT PATCH v2] seqlock: serialize against writers
  2008-08-30 12:38       ` Peter Zijlstra
@ 2008-08-30 13:05         ` Gregory Haskins
  0 siblings, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-08-30 13:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, rostedt, tglx, linux-kernel, linux-rt-users,
	gregory.haskins, andi, shemminger


Peter Zijlstra wrote:
> On Sat, 2008-08-30 at 08:32 -0400, Gregory Haskins wrote:
>   
>> Peter Zijlstra wrote:
>>     
>>> On Fri, 2008-08-29 at 14:03 -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>> *Patch submitted for inclusion in PREEMPT_RT 26-rt4.  Applies to 2.6.26.3-rt3*
>>>>
>>>> Hi Ingo, Steven, Thomas,
>>>>   Please consider for -rt4.  This fixes a nasty deadlock on my systems under
>>>>   heavy load.
>>>>
>>>> [
>>>> Changelog:
>>>> 	v2: only touch seqlock_t because raw_seqlock_t doesn't require
>>>> 	    serialization and userspace cannot modify data during a read
>>>>
>>>> 	v1: initial release
>>>> ]
>>>>
>>>> -Greg
>>>>
>>>> ----
>>>> seqlock: serialize against writers
>>>>
>>>> Seqlocks have always advertised that readers do not "block", but this was
>>>> never really true.  Readers have always logically blocked at the head of
>>>> the critical section under contention with writers, regardless of whether
>>>> they were allowed to run code or not.
>>>>
>>>> Recent changes in this space (88a411c07b6fedcfc97b8dc51ae18540bd2beda0)
>>>> have turned this into a more explicit blocking operation in mainline.
>>>> However, this change highlights a short-coming in -rt because the
>>>> normal seqlock_ts are preemptible.  This means that we can potentially
>>>> deadlock should a reader spin waiting for a write critical-section to end
>>>> while the writer is preempted.
>>>>     
>>>>         
>>> Ah, the point I was missing is higher-priority realtime task, in which
>>> case the write side will never run because it wont preempt.
>>>   
>>>       
>> Yep
>>     
>>>   
>>>       
>>>> This patch changes the internal implementation to use a rwlock and forces
>>>> the readers to serialize with the writers under contention.  This will
>>>> have the advantage that -rt seqlocks_t will sleep the reader if deadlock
>>>> were imminent, and it will pi-boost the writer to prevent inversion.
>>>>
>>>> This fixes a deadlock discovered under testing where all high prioritiy
>>>> readers were hogging the cpus and preventing a writer from releasing the
>>>> lock.
>>>>
>>>> Since seqlocks are designed to be used as rarely-write locks, this should
>>>> not affect the performance in the fast-path
>>>>     
>>>>         
>>> Still dont like this patch, once you have a rwlock you might as well go
>>> all the way.
>>>       
>> Why?  
>>     
>
> Because the second point.
>
>   
>> A full rwlock will still be much slower since the readers will
>> always need an atomic op.  This construct only uses atomic ops in the
>> slow path under contention, which should be rare, and is thus still
>> superior when retries are permissible to the design.
>>
>>     
>>>  Esp since this half-arsed construct defeats PI in certain
>>> cases.
>>>   
>>>       
>> Ouch.  While I admit that you can still get into inversion scenarios
>> once the reader leaves the seqbegin, this is the nature of seqlocks. 
>> The only ways I can think of to get around this involve atomic ops in
>> the fast path, which I think should be avoided.  What would you suggest
>> otherwise?
>>     
>
> Since we're talking -rt here, determinism rules, so bite the bullet and
> do full PI.
>
> The only reason we made all that stuff preemptable is to gain
> determinism, that also means we have to do the PI thing.
>   
Yeah, you have a point.  I still think this patch will solve the
deadlock thing, so please consider it in the interim.  I will whip up a
full PI solution next week.

-Greg




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RT PATCH v3] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
                   ` (3 preceding siblings ...)
  2008-08-30 11:08 ` [PATCH] " Peter Zijlstra
@ 2008-09-02 12:45 ` Gregory Haskins
  2008-09-02 13:01   ` Gregory Haskins
  2008-09-02 13:29 ` [RT PATCH v4] " Gregory Haskins
  5 siblings, 1 reply; 28+ messages in thread
From: Gregory Haskins @ 2008-09-02 12:45 UTC (permalink / raw)
  To: mingo, rostedt, tglx
  Cc: linux-kernel, linux-rt-users, gregory.haskins, andi, shemminger

Hi Guys,

Based on the feedback I received, I realized the prologue was not
sufficiently descriptive.  Therefore, here is a v3 with a new description.
The patch content itself is identical to v2.

-Greg

---------------------------------

seqlock: serialize against writers

There are currently several problems in -rt w.r.t. seqlock objects. RT
moves the mainline seqlock_t to "raw_seqlock_t", and creates a new seqlock_t
object that is fully preemptible.  Being preemptible is a great step
towards deterministic behavior, but there are a few areas that need
additional measures to protect against new vulnerabilities introduced by
the preemptible code. For the purposes of demonstration, consider three
tasks of different priority: A, B, and C.  A has the logically highest
priority, and C the lowest.  A is trying to acquire a seqlock read
critical section, while C is involved in the write lock.

Problem 1) If A spins in seqbegin due to writer contention retries, it may
prevent C from running even if C currently holds the write lock.  This
is a deadlock.

Problem 2) B may preempt C, preventing it from releasing the write
critical section.  In this case, A becomes inverted behind B.

Problem 3) Lower priority tasks such as C may continually acquire the write
section which subsequently causes A to continually retry and thus fail to
make forward progress.  Since C is lower priority it ideally should not
cause delays in A. E.g. C should block if A is in a read-lock and C is <= A.

This patch addresses Problems 1 & 2, and leaves 3 for a later time.

This patch changes the internal seqlock_t implementation to substitute
a rwlock for the basic spinlock previously used, and forces the readers
to serialize with the writers under contention.  Blocking on the read_lock
simultaneously sleeps A (preventing problem 1), while boosting C to A's
priority (preventing problem 2).  Non reader-to-writer contended
acquisitions, which are the predominant mode, remain free of atomic
operations.  Therefore the fast path should not be perturbed by this
change.

This fixes a real-world deadlock discovered under testing where all
high priority readers were hogging the cpus and preventing a writer
from releasing the lock (i.e. problem 1).

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/seqlock.h |   67 +++++++++++++++++++++++------------------------
 1 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index b172277..003d6e4 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,9 +3,11 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. Readers block
+ * on write contention (and where applicable, pi-boost the writer).
+ * Readers without contention on entry acquire the critical section
+ * without any atomic operations, but they may have to retry if a writer
+ * enters before the critical section ends. Writers do not wait for readers.
  *
  * This is not as cache friendly as brlock. Also, this will not work
  * for data that contains pointers, because any writer could
@@ -24,6 +26,8 @@
  *
  * Based on x86_64 vsyscall gettimeofday 
  * by Keith Owens and Andrea Arcangeli
+ *
+ * Priority inheritance and live-lock avoidance by Gregory Haskins
  */
 
 #include <linux/spinlock.h>
@@ -31,7 +35,7 @@
 
 typedef struct {
 	unsigned sequence;
-	spinlock_t lock;
+	rwlock_t lock;
 } __seqlock_t;
 
 typedef struct {
@@ -57,7 +61,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
 
 #ifdef CONFIG_PREEMPT_RT
-# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
 #else
 # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
 #endif
@@ -69,7 +73,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
 
 #define seqlock_init(x) \
-		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
 
 #define DEFINE_SEQLOCK(x) \
 		seqlock_t x = __SEQLOCK_UNLOCKED(x)
@@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
  */
 static inline void __write_seqlock(seqlock_t *sl)
 {
-	spin_lock(&sl->lock);
+	write_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
@@ -94,12 +98,12 @@ static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
-	spin_unlock(&sl->lock);
+	write_unlock(&sl->lock);
 }
 
 static inline int __write_tryseqlock(seqlock_t *sl)
 {
-	int ret = spin_trylock(&sl->lock);
+	int ret = write_trylock(&sl->lock);
 
 	if (ret) {
 		++sl->sequence;
@@ -109,18 +113,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
 {
 	unsigned ret;
 
-repeat:
 	ret = sl->sequence;
 	smp_rmb();
 	if (unlikely(ret & 1)) {
-		cpu_relax();
-		goto repeat;
+		/*
+		 * Serialize with the writer, which will ensure they are
+		 * pi-boosted if necessary and prevent us from starving
+		 * them.
+		 */
+		read_lock(&sl->lock);
+		ret = sl->sequence;
+		read_unlock(&sl->lock);
 	}
 
+	BUG_ON(ret & 1);
+
 	return ret;
 }
 
@@ -132,20 +143,8 @@ repeat:
  */
 static inline int __read_seqretry(seqlock_t *sl, unsigned start)
 {
-	int ret;
-
 	smp_rmb();
-	ret = (sl->sequence != start);
-	/*
-	 * If invalid then serialize with the writer, to make sure we
-	 * are not livelocking it:
-	 */
-	if (unlikely(ret)) {
-		unsigned long flags;
-		spin_lock_irqsave(&sl->lock, flags);
-		spin_unlock_irqrestore(&sl->lock, flags);
-	}
-	return ret;
+	return (sl->sequence != start);
 }
 
 static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
@@ -173,7 +172,7 @@ static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
 	return ret;
 }
 
-static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin_raw(raw_seqlock_t *sl)
 {
 	unsigned ret;
 
@@ -188,7 +187,7 @@ repeat:
 	return ret;
 }
 
-static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned start)
+static __always_inline int __read_seqretry_raw(raw_seqlock_t *sl, unsigned start)
 {
 	smp_rmb();
 	return (sl->sequence != start);
@@ -218,12 +217,12 @@ do {								\
 	__ret;							\
 })
 
-#define PICK_SEQOP_CONST_RET(op, lock)				\
+#define PICK_SEQOP_RET(op, lock)				\
 ({								\
 	unsigned long __ret;					\
 								\
 	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
-		__ret = op##_raw((const raw_seqlock_t *)(lock));\
+		__ret = op##_raw((raw_seqlock_t *)(lock));\
 	else if (TYPE_EQUAL((lock), seqlock_t))			\
 		__ret = op((seqlock_t *)(lock));		\
 	else __ret = __bad_seqlock_type();			\
@@ -231,12 +230,12 @@ do {								\
 	__ret;							\
 })
 
-#define PICK_SEQOP2_CONST_RET(op, lock, arg)				\
+#define PICK_SEQOP2_RET(op, lock, arg)					\
  ({									\
 	unsigned long __ret;						\
 									\
 	if (TYPE_EQUAL((lock), raw_seqlock_t))				\
-		__ret = op##_raw((const raw_seqlock_t *)(lock), (arg));	\
+		__ret = op##_raw((raw_seqlock_t *)(lock), (arg));	\
 	else if (TYPE_EQUAL((lock), seqlock_t))				\
 		__ret = op((seqlock_t *)(lock), (arg));			\
 	else __ret = __bad_seqlock_type();				\
@@ -248,8 +247,8 @@ do {								\
 #define write_seqlock(sl)	PICK_SEQOP(__write_seqlock, sl)
 #define write_sequnlock(sl)	PICK_SEQOP(__write_sequnlock, sl)
 #define write_tryseqlock(sl)	PICK_SEQOP_RET(__write_tryseqlock, sl)
-#define read_seqbegin(sl)	PICK_SEQOP_CONST_RET(__read_seqbegin, sl)
-#define read_seqretry(sl, iv)	PICK_SEQOP2_CONST_RET(__read_seqretry, sl, iv)
+#define read_seqbegin(sl)	PICK_SEQOP_RET(__read_seqbegin, sl)
+#define read_seqretry(sl, iv)	PICK_SEQOP2_RET(__read_seqretry, sl, iv)
 
 /*
  * Version using sequence counter only.


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [RT PATCH v3] seqlock: serialize against writers
  2008-09-02 12:45 ` [RT PATCH v3] " Gregory Haskins
@ 2008-09-02 13:01   ` Gregory Haskins
  0 siblings, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-09-02 13:01 UTC (permalink / raw)
  To: mingo, rostedt, tglx; +Cc: linux-kernel, linux-rt-users, andi, shemminger


Gregory Haskins wrote:
> Hi Guys,
>
> I realized the prologue was not sufficiently descriptive based on the
> feedback I received.  Therefore, here is a V3 with a new description.
> The patch content itself is identical to v2.
>   
/me grumbles.

Just realized I was in the wrong git branch when I updated this, so this
will not apply to 26.3-rt3.  I will correct and send a v4.  Sorry for
the noise.

-Greg


> -Greg
>
> ---------------------------------
>
> seqlock: serialize against writers
>
> There are currently several problems in -rt w.r.t. seqlock objects. RT
> moves mainline seqlock_t to "raw_seqlock_t", and creates a new seqlock_t
> object that is fully preemptible.  Being preemptible is a great step
> towards deterministic behavior, but there are a few areas that need
> additional measures to protect new vulnerabilities created by the
> preemptible code. For the purposes of demonstration, consider three tasks
> of different priority: A, B, and C.  A is the logically highest, and C is
> the lowest.  A is trying to acquire a seqlock read critical section, while
> C is involved in write locks.
>
> Problem 1) If A spins in seqbegin due to writer contention retries, it may
> prevent C from running even if C currently holds the write lock.  This
> is a deadlock.
>
> Problem 2) B may preempt C, preventing it from releasing the write
> critical section.  In this case, A becomes inverted behind B.
>
> Problem 3) Lower priority tasks such as C may continually acquire the write
> section which subsequently causes A to continually retry and thus fail to
> make forward progress.  Since C is lower priority it ideally should not
> cause delays in A. E.g. C should block if A is in a read-lock and C is <= A.
>
> This patch addresses Problems 1 & 2, and leaves 3 for a later time.
>
> This patch changes the internal seqlock_t implementation to substitute
> a rwlock for the basic spinlock previously used, and forces the readers
> to serialize with the writers under contention.  Blocking on the read_lock
> simultaneously sleeps A (preventing problem 1), while boosting C to A's
> priority (preventing problem 2).  Non reader-to-writer contended
> acquisitions, which are the predominant mode, remain free of atomic
> operations.  Therefore the fast path should not be perturbed by this
> change.
>
> This fixes a real-world deadlock discovered under testing where all
> high priority readers were hogging the cpus and preventing a writer
> from releasing the lock (i.e. problem 1).
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
>  include/linux/seqlock.h |   67 +++++++++++++++++++++++------------------------
>  1 files changed, 33 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index b172277..003d6e4 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -3,9 +3,11 @@
>  /*
>   * Reader/writer consistent mechanism without starving writers. This type of
>   * lock for data where the reader wants a consistent set of information
> - * and is willing to retry if the information changes.  Readers never
> - * block but they may have to retry if a writer is in
> - * progress. Writers do not wait for readers. 
> + * and is willing to retry if the information changes. Readers block
> + * on write contention (and where applicable, pi-boost the writer).
> + * Readers without contention on entry acquire the critical section
> + * without any atomic operations, but they may have to retry if a writer
> + * enters before the critical section ends. Writers do not wait for readers.
>   *
>   * This is not as cache friendly as brlock. Also, this will not work
>   * for data that contains pointers, because any writer could
> @@ -24,6 +26,8 @@
>   *
>   * Based on x86_64 vsyscall gettimeofday 
>   * by Keith Owens and Andrea Arcangeli
> + *
> + * Priority inheritance and live-lock avoidance by Gregory Haskins
>   */
>  
>  #include <linux/spinlock.h>
> @@ -31,7 +35,7 @@
>  
>  typedef struct {
>  	unsigned sequence;
> -	spinlock_t lock;
> +	rwlock_t lock;
>  } __seqlock_t;
>  
>  typedef struct {
> @@ -57,7 +61,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
>  		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
>  
>  #ifdef CONFIG_PREEMPT_RT
> -# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
> +# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
>  #else
>  # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
>  #endif
> @@ -69,7 +73,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
>  	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
>  
>  #define seqlock_init(x) \
> -		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
> +		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
>  
>  #define DEFINE_SEQLOCK(x) \
>  		seqlock_t x = __SEQLOCK_UNLOCKED(x)
> @@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
>   */
>  static inline void __write_seqlock(seqlock_t *sl)
>  {
> -	spin_lock(&sl->lock);
> +	write_lock(&sl->lock);
>  	++sl->sequence;
>  	smp_wmb();
>  }
> @@ -94,12 +98,12 @@ static inline void __write_sequnlock(seqlock_t *sl)
>  {
>  	smp_wmb();
>  	sl->sequence++;
> -	spin_unlock(&sl->lock);
> +	write_unlock(&sl->lock);
>  }
>  
>  static inline int __write_tryseqlock(seqlock_t *sl)
>  {
> -	int ret = spin_trylock(&sl->lock);
> +	int ret = write_trylock(&sl->lock);
>  
>  	if (ret) {
>  		++sl->sequence;
> @@ -109,18 +113,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
>  }
>  
>  /* Start of read calculation -- fetch last complete writer token */
> -static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
> +static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
>  {
>  	unsigned ret;
>  
> -repeat:
>  	ret = sl->sequence;
>  	smp_rmb();
>  	if (unlikely(ret & 1)) {
> -		cpu_relax();
> -		goto repeat;
> +		/*
> +		 * Serialze with the writer which will ensure they are
> +		 * pi-boosted if necessary and prevent us from starving
> +		 * them.
> +		 */
> +		read_lock(&sl->lock);
> +		ret = sl->sequence;
> +		read_unlock(&sl->lock);
>  	}
>  
> +	BUG_ON(ret & 1);
> +
>  	return ret;
>  }
>  
> @@ -132,20 +143,8 @@ repeat:
>   */
>  static inline int __read_seqretry(seqlock_t *sl, unsigned start)
>  {
> -	int ret;
> -
>  	smp_rmb();
> -	ret = (sl->sequence != start);
> -	/*
> -	 * If invalid then serialize with the writer, to make sure we
> -	 * are not livelocking it:
> -	 */
> -	if (unlikely(ret)) {
> -		unsigned long flags;
> -		spin_lock_irqsave(&sl->lock, flags);
> -		spin_unlock_irqrestore(&sl->lock, flags);
> -	}
> -	return ret;
> +	return (sl->sequence != start);
>  }
>  
>  static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
> @@ -173,7 +172,7 @@ static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
>  	return ret;
>  }
>  
> -static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
> +static __always_inline unsigned __read_seqbegin_raw(raw_seqlock_t *sl)
>  {
>  	unsigned ret;
>  
> @@ -188,7 +187,7 @@ repeat:
>  	return ret;
>  }
>  
> -static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned start)
> +static __always_inline int __read_seqretry_raw(raw_seqlock_t *sl, unsigned start)
>  {
>  	smp_rmb();
>  	return (sl->sequence != start);
> @@ -218,12 +217,12 @@ do {								\
>  	__ret;							\
>  })
>  
> -#define PICK_SEQOP_CONST_RET(op, lock)				\
> +#define PICK_SEQOP_RET(op, lock)				\
>  ({								\
>  	unsigned long __ret;					\
>  								\
>  	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
> -		__ret = op##_raw((const raw_seqlock_t *)(lock));\
> +		__ret = op##_raw((raw_seqlock_t *)(lock));\
>  	else if (TYPE_EQUAL((lock), seqlock_t))			\
>  		__ret = op((seqlock_t *)(lock));		\
>  	else __ret = __bad_seqlock_type();			\
> @@ -231,12 +230,12 @@ do {								\
>  	__ret;							\
>  })
>  
> -#define PICK_SEQOP2_CONST_RET(op, lock, arg)				\
> +#define PICK_SEQOP2_RET(op, lock, arg)					\
>   ({									\
>  	unsigned long __ret;						\
>  									\
>  	if (TYPE_EQUAL((lock), raw_seqlock_t))				\
> -		__ret = op##_raw((const raw_seqlock_t *)(lock), (arg));	\
> +		__ret = op##_raw((raw_seqlock_t *)(lock), (arg));	\
>  	else if (TYPE_EQUAL((lock), seqlock_t))				\
>  		__ret = op((seqlock_t *)(lock), (arg));			\
>  	else __ret = __bad_seqlock_type();				\
> @@ -248,8 +247,8 @@ do {								\
>  #define write_seqlock(sl)	PICK_SEQOP(__write_seqlock, sl)
>  #define write_sequnlock(sl)	PICK_SEQOP(__write_sequnlock, sl)
>  #define write_tryseqlock(sl)	PICK_SEQOP_RET(__write_tryseqlock, sl)
> -#define read_seqbegin(sl)	PICK_SEQOP_CONST_RET(__read_seqbegin, sl)
> -#define read_seqretry(sl, iv)	PICK_SEQOP2_CONST_RET(__read_seqretry, sl, iv)
> +#define read_seqbegin(sl)	PICK_SEQOP_RET(__read_seqbegin, sl)
> +#define read_seqretry(sl, iv)	PICK_SEQOP2_RET(__read_seqretry, sl, iv)
>  
>  /*
>   * Version using sequence counter only.
>
>   




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RT PATCH v4] seqlock: serialize against writers
  2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
                   ` (4 preceding siblings ...)
  2008-09-02 12:45 ` [RT PATCH v3] " Gregory Haskins
@ 2008-09-02 13:29 ` Gregory Haskins
  5 siblings, 0 replies; 28+ messages in thread
From: Gregory Haskins @ 2008-09-02 13:29 UTC (permalink / raw)
  To: mingo, rostedt, tglx
  Cc: linux-kernel, linux-rt-users, gregory.haskins, andi, shemminger

[ here is the patch with the updated prologue, rebased against the proper tree (26.3-rt3) ]

--------------------------

seqlock: serialize against writers

There are currently several problems in -rt w.r.t. seqlock objects. RT
moves the mainline seqlock_t to "raw_seqlock_t", and creates a new seqlock_t
object that is fully preemptible.  Being preemptible is a great step
towards deterministic behavior, but there are a few areas that need
additional measures to protect against new vulnerabilities introduced by
the preemptible code. For the purposes of demonstration, consider three
tasks of different priority: A, B, and C.  A has the logically highest
priority, and C the lowest.  A is trying to acquire a seqlock read
critical section, while C is involved in the write lock.

Problem 1) If A spins in seqbegin due to writer contention retries, it may
prevent C from running even if C currently holds the write lock.  This
is a deadlock.

Problem 2) B may preempt C, preventing it from releasing the write
critical section.  In this case, A becomes inverted behind B.

Problem 3) Lower priority tasks such as C may continually acquire the write
section which subsequently causes A to continually retry and thus fail to
make forward progress.  Since C is lower priority it ideally should not
cause delays in A. E.g. C should block if A is in a read-lock and C is <= A.

This patch addresses Problems 1 & 2, and leaves 3 for a later time.

This patch changes the internal seqlock_t implementation to substitute
a rwlock for the basic spinlock previously used, and forces the readers
to serialize with the writers under contention.  Blocking on the read_lock
simultaneously sleeps A (preventing problem 1), while boosting C to A's
priority (preventing problem 2).  Non reader-to-writer contended
acquisitions, which are the predominant mode, remain free of atomic
operations.  Therefore the fast path should not be perturbed by this
change.

This fixes a real-world deadlock discovered under testing where all
high priority readers were hogging the cpus and preventing a writer
from releasing the lock (i.e. problem 1).

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/seqlock.h |   51 +++++++++++++++++++++++------------------------
 1 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 345d726..605fcdb 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,9 +3,11 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. Readers block
+ * on write contention (and where applicable, pi-boost the writer).
+ * Readers without contention on entry acquire the critical section
+ * without any atomic operations, but they may have to retry if a writer
+ * enters before the critical section ends. Writers do not wait for readers.
  *
  * This is not as cache friendly as brlock. Also, this will not work
  * for data that contains pointers, because any writer could
@@ -24,6 +26,8 @@
  *
  * Based on x86_64 vsyscall gettimeofday 
  * by Keith Owens and Andrea Arcangeli
+ *
+ * Priority inheritance and live-lock avoidance by Gregory Haskins
  */
 
 #include <linux/spinlock.h>
@@ -31,7 +35,7 @@
 
 typedef struct {
 	unsigned sequence;
-	spinlock_t lock;
+	rwlock_t lock;
 } __seqlock_t;
 
 typedef struct {
@@ -57,7 +61,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
 
 #ifdef CONFIG_PREEMPT_RT
-# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) }
 #else
 # define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
 #endif
@@ -69,7 +73,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
 	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
 
 #define seqlock_init(x) \
-		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); rwlock_init(&(x)->lock); } while (0)
 
 #define DEFINE_SEQLOCK(x) \
 		seqlock_t x = __SEQLOCK_UNLOCKED(x)
@@ -85,7 +89,7 @@ typedef __raw_seqlock_t raw_seqlock_t;
  */
 static inline void __write_seqlock(seqlock_t *sl)
 {
-	spin_lock(&sl->lock);
+	write_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
@@ -103,14 +107,14 @@ static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
-	spin_unlock(&sl->lock);
+	write_unlock(&sl->lock);
 }
 
 #define __write_sequnlock_irqrestore(sl, flags)	__write_sequnlock(sl)
 
 static inline int __write_tryseqlock(seqlock_t *sl)
 {
-	int ret = spin_trylock(&sl->lock);
+	int ret = write_trylock(&sl->lock);
 
 	if (ret) {
 		++sl->sequence;
@@ -120,18 +124,25 @@ static inline int __write_tryseqlock(seqlock_t *sl)
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin(seqlock_t *sl)
 {
 	unsigned ret;
 
-repeat:
 	ret = sl->sequence;
 	smp_rmb();
 	if (unlikely(ret & 1)) {
-		cpu_relax();
-		goto repeat;
+		/*
+		 * Serialize with the writer, which will ensure they are
+		 * pi-boosted if necessary and prevent us from starving
+		 * them.
+		 */
+		read_lock(&sl->lock);
+		ret = sl->sequence;
+		read_unlock(&sl->lock);
 	}
 
+	BUG_ON(ret & 1);
+
 	return ret;
 }
 
@@ -142,20 +153,8 @@ repeat:
  */
 static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
 {
-	int ret;

^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread

Thread overview: 28+ messages
2008-08-29 15:44 [PATCH] seqlock: serialize against writers Gregory Haskins
2008-08-29 16:09 ` Andi Kleen
2008-08-29 16:10   ` Gregory Haskins
2008-08-29 16:22     ` Andi Kleen
2008-08-29 16:26       ` Gregory Haskins
2008-08-29 16:34         ` Steven Rostedt
2008-08-29 16:35           ` Gregory Haskins
2008-08-29 16:45             ` Andi Kleen
2008-08-29 16:53               ` Steven Rostedt
2008-08-29 17:00                 ` Gregory Haskins
2008-08-29 17:00               ` Gregory Haskins
2008-08-29 16:58             ` Steven Rostedt
2008-08-29 16:29       ` Gregory Haskins
2008-08-29 16:37         ` Andi Kleen
2008-08-29 16:41           ` Gregory Haskins
2008-08-29 17:08             ` Andi Kleen
2008-08-29 16:57 ` Stephen Hemminger
2008-08-29 17:02   ` [ RT PATCH] " Steven Rostedt
2008-08-29 18:03 ` [RT PATCH v2] " Gregory Haskins
2008-08-29 18:12   ` Gregory Haskins
2008-08-30 11:17   ` Peter Zijlstra
2008-08-30 12:32     ` Gregory Haskins
2008-08-30 12:38       ` Peter Zijlstra
2008-08-30 13:05         ` Gregory Haskins
2008-08-30 11:08 ` [PATCH] " Peter Zijlstra
2008-09-02 12:45 ` [RT PATCH v3] " Gregory Haskins
2008-09-02 13:01   ` Gregory Haskins
2008-09-02 13:29 ` [RT PATCH v4] " Gregory Haskins
