All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrew Morton <akpm@digeo.com>
To: Anton Blanchard <anton@samba.org>
Cc: green@namesys.com, linux-kernel@vger.kernel.org, hch@lst.de,
	jack@suse.cz, mason@suse.com, shemminger@osdl.org
Subject: Re: ext2 FS corruption with 2.5.59.
Date: Sun, 26 Jan 2003 03:23:01 -0800	[thread overview]
Message-ID: <20030126032301.4e0f8dc9.akpm@digeo.com> (raw)
In-Reply-To: <20030126111108.GB25001@krispykreme>

Anton Blanchard <anton@samba.org> wrote:
>
>  
> Hi,
> 
> > +static inline void i_size_write(struct inode * inode, loff_t i_size)
> > +{
> > +#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
> > +#ifdef __ARCH_HAS_GET_SET_64BIT
> > +	set_64bit((unsigned long long *) &inode->i_size, (unsigned long long) i_size);
> > +#else
> > +	inode->i_size_version2++;
> > +	wmb();
> > +	inode->i_size = i_size;
> > +	wmb();
> > +	inode->i_size_version1++;
> > +	wmb(); /* make it visible ASAP */
> > +#endif
> > +#elif BITS_PER_LONG==64 || !defined(CONFIG_SMP)
> > +	inode->i_size = i_size;
> > +#endif
> > +}
> 
> That last wmb is suspect. We dont put an wmb after a spinlock to "make it
> visible". If you think of an wmb as an ordering tag that propagates out
> through the cpu and storage hierarchy then wmb is not going to help us here.

OK.

> I guess the store could get reordered (and so delayed) a bit, but an wmb
> is relatively expensive on some architectures.
> 
> > This is actually fairly pointless, because these fields are write-mostly and
> > read-rarely.  But we need a spinlock anyway because of the concurrent
> > modifiers problem.
> 
> It would be interesting to compare a spinlock or rwlock against a frlock
> in a write mostly situation. Actually isnt it going to be slower because
> we have to take a spinlock to serialise around the frlock write path?

A little bit, if it it very write-mostly.  But the gains on the read side
will compensate.

Here is Stephen's implementation:



 linux/frlock.h |  100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 linux/time.h   |    3 +
 time.c         |   31 +++++++++--------
 timer.c        |   22 ++++++------
 4 files changed, 131 insertions(+), 25 deletions(-)

diff -puN /dev/null include/linux/frlock.h
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ linux-hype-akpm/include/linux/frlock.h	2003-01-25 18:35:14.000000000 -0800
@@ -0,0 +1,100 @@
+#ifndef __LINUX_FRLOCK_H
+#define __LINUX_FRLOCK_H
+
+/*
+ * Fast read-write spinlocks.
+ *
+ * Fast reader/writer locks without starving writers. This type of
+ * lock for data where the reader wants a consitent set of information
+ * and is willing to retry if the information changes.  Readers never
+ * block but they may have to retry if a writer is in
+ * progress. Writers do not wait for readers. 
+ *
+ * Generalization on sequence variables used for gettimeofday on x86-64 
+ * by Andrea Arcangeli
+ *
+ * This is not as cache friendly as brlock. Also, this will not work
+ * for data that contains pointers, because any writer could
+ * invalidate a pointer that a reader was following.
+ *
+ * 
+ * Expected reader usage:
+ * 	do {
+ *	    seq = fr_read_begin();
+ * 	...
+ *      } while (seq != fr_read_end());
+ *
+ * On non-SMP the spin locks disappear but the writer still needs
+ * to increment the sequence variables because an interrupt routine could
+ * change the state of the data.
+ */
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+
+typedef struct {
+	spinlock_t lock;
+	unsigned pre_sequence;
+	unsigned post_sequence;
+} frlock_t;
+
+#define FR_LOCK_UNLOCKED	(frlock_t) { SPIN_LOCK_UNLOCKED, 0, 0 }
+#define frlock_init(x)		do { *(x) = FR_LOCK_UNLOCKED; } while (0)
+
+static inline void fr_write_lock(frlock_t *rw)
+{
+	spin_lock(&rw->lock);
+	rw->pre_sequence++;
+	wmb();
+}	
+
+static inline void fr_write_unlock(frlock_t *rw) 
+{
+	wmb();
+	rw->post_sequence++;
+	spin_unlock(&rw->lock);
+}
+
+static inline int fr_write_trylock(frlock_t *rw)
+{
+	int ret  = spin_trylock(&rw->lock);
+
+	if (ret) {
+		++rw->pre_sequence;
+		wmb();
+	}
+	return ret;
+}
+
+static inline unsigned fr_read_begin(frlock_t *rw) 
+{
+	unsigned ret = rw->post_sequence;
+	rmb();
+	return ret;
+	
+}
+
+static inline unsigned fr_read_end(frlock_t *rw)
+{
+	rmb();
+	return rw->pre_sequence;
+}
+
+/*
+ * Possible sw/hw IRQ protected versions of the interfaces.
+ */
+#define fr_write_lock_irqsave(lock, flags) \
+	do { local_irq_save(flags);	 fr_write_lock(lock); } while (0)
+#define fr_write_lock_irq(lock) \
+	do { local_irq_disable();	 fr_write_lock(lock); } while (0)
+#define fr_write_lock_bh(lock) \
+        do { local_bh_disable();	 fr_write_lock(lock); } while (0)
+
+#define fr_write_unlock_irqrestore(lock, flags)	\
+	do { fr_write_unlock(lock); local_irq_restore(flags); } while(0)
+#define fr_write_unlock_irq(lock) \
+	do { fr_write_unlock(lock); local_irq_enable(); } while(0)
+#define fr_write_unlock_bh(lock) \
+	do { fr_write_unlock(lock); local_bh_enable(); } while(0)
+
+#endif /* __LINUX_FRLOCK_H */
diff -puN include/linux/time.h~frlock-xtime include/linux/time.h
--- linux-hype/include/linux/time.h~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/include/linux/time.h	2003-01-25 18:11:45.000000000 -0800
@@ -25,6 +25,7 @@ struct timezone {
 #ifdef __KERNEL__
 
 #include <linux/spinlock.h>
+#include <linux/frlock.h>
 
 /*
  * Change timeval to jiffies, trying to avoid the
@@ -120,7 +121,7 @@ mktime (unsigned int year, unsigned int 
 }
 
 extern struct timespec xtime;
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
 
 static inline unsigned long get_seconds(void)
 { 
diff -puN kernel/time.c~frlock-xtime kernel/time.c
--- linux-hype/kernel/time.c~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/kernel/time.c	2003-01-25 18:11:45.000000000 -0800
@@ -27,7 +27,6 @@
 #include <linux/timex.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
-
 #include <asm/uaccess.h>
 
 /* 
@@ -38,7 +37,7 @@ struct timezone sys_tz;
 
 /* The xtime_lock is not only serializing the xtime read/writes but it's also
    serializing all accesses to the global NTP variables now. */
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
 extern unsigned long last_time_offset;
 
 #if !defined(__alpha__) && !defined(__ia64__)
@@ -80,7 +79,7 @@ asmlinkage long sys_stime(int * tptr)
 		return -EPERM;
 	if (get_user(value, tptr))
 		return -EFAULT;
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	xtime.tv_sec = value;
 	xtime.tv_nsec = 0;
 	last_time_offset = 0;
@@ -88,7 +87,7 @@ asmlinkage long sys_stime(int * tptr)
 	time_status |= STA_UNSYNC;
 	time_maxerror = NTP_PHASE_LIMIT;
 	time_esterror = NTP_PHASE_LIMIT;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 	return 0;
 }
 
@@ -96,13 +95,13 @@ asmlinkage long sys_stime(int * tptr)
 
 asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-	if (tv) {
+	if (likely(tv != NULL)) {
 		struct timeval ktv;
 		do_gettimeofday(&ktv);
 		if (copy_to_user(tv, &ktv, sizeof(ktv)))
 			return -EFAULT;
 	}
-	if (tz) {
+	if (unlikely(tz != NULL)) {
 		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
 			return -EFAULT;
 	}
@@ -127,10 +126,10 @@ asmlinkage long sys_gettimeofday(struct 
  */
 inline static void warp_clock(void)
 {
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
 	last_time_offset = 0;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 }
 
 /*
@@ -235,7 +234,7 @@ int do_adjtimex(struct timex *txc)
 		    txc->tick > 1100000/USER_HZ)
 			return -EINVAL;
 
-	write_lock_irq(&xtime_lock);
+	fr_write_lock_irq(&xtime_lock);
 	result = time_state;	/* mostly `TIME_OK' */
 
 	/* Save for later - semantics of adjtime is to return old value */
@@ -386,7 +385,7 @@ leave:	if ((time_status & (STA_UNSYNC|ST
 	txc->errcnt	   = pps_errcnt;
 	txc->stbcnt	   = pps_stbcnt;
 	last_time_offset = 0;
-	write_unlock_irq(&xtime_lock);
+	fr_write_unlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
 	return(result);
 }
@@ -409,9 +408,13 @@ asmlinkage long sys_adjtimex(struct time
 struct timespec current_kernel_time(void)
 {
         struct timespec now;
-        unsigned long flags;
-        read_lock_irqsave(&xtime_lock,flags);
-	now = xtime;
-        read_unlock_irqrestore(&xtime_lock,flags);
+        unsigned long seq;
+
+	do {
+		seq = fr_read_begin(&xtime_lock);
+		
+		now = xtime;
+	} while (seq != fr_read_end(&xtime_lock));
+
 	return now; 
 }
diff -puN kernel/timer.c~frlock-xtime kernel/timer.c
--- linux-hype/kernel/timer.c~frlock-xtime	2003-01-25 18:11:45.000000000 -0800
+++ linux-hype-akpm/kernel/timer.c	2003-01-25 18:11:45.000000000 -0800
@@ -758,7 +758,7 @@ unsigned long wall_jiffies;
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
  */
-rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED;
+frlock_t xtime_lock __cacheline_aligned_in_smp = FR_LOCK_UNLOCKED;
 unsigned long last_time_offset;
 
 /*
@@ -798,8 +798,7 @@ static inline void update_times(void)
 }
   
 /*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without holding read_lock_irq(&xtime_lock).
+ * The 64-bit jiffies value is not atomic 
  * jiffies is defined in the linker script...
  */
 
@@ -1087,18 +1086,21 @@ asmlinkage long sys_sysinfo(struct sysin
 	struct sysinfo val;
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
+	unsigned long seq;
 
 	memset((char *)&val, 0, sizeof(struct sysinfo));
 
-	read_lock_irq(&xtime_lock);
-	val.uptime = jiffies / HZ;
+	do {
+		seq = fr_read_begin(&xtime_lock);
 
-	val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-	val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-	val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+		val.uptime = jiffies / HZ;
 
-	val.procs = nr_threads;
-	read_unlock_irq(&xtime_lock);
+		val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+		val.procs = nr_threads;
+	} while (seq != fr_read_end(&xtime_lock));
 
 	si_meminfo(&val);
 	si_swapinfo(&val);

_


  reply	other threads:[~2003-01-26 11:13 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-01-23 12:38 ext2 FS corruption with 2.5.59 Oleg Drokin
2003-01-23 14:09 ` Hugh Dickins
2003-01-23 14:26   ` Oleg Drokin
2003-01-23 14:39     ` Oleg Drokin
2003-01-24 10:32 ` Andrew Morton
2003-01-24 12:39   ` Oleg Drokin
2003-01-25  6:53     ` Andrew Morton
2003-01-25 12:36       ` Oleg Drokin
2003-01-25 23:13         ` Andrew Morton
2003-01-26  9:25           ` Oleg Drokin
2003-01-26  3:04         ` Andrew Morton
2003-01-26  3:28           ` William Lee Irwin III
2003-01-26  3:46             ` Andrew Morton
2003-01-26  4:14               ` William Lee Irwin III
2003-01-26  5:10                 ` Andrew Morton
2003-01-27 22:59                   ` Stephen Hemminger
2003-01-27 23:59                     ` William Lee Irwin III
2003-01-26 11:11           ` Anton Blanchard
2003-01-26 11:23             ` Andrew Morton [this message]
2003-01-28 13:50   ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20030126032301.4e0f8dc9.akpm@digeo.com \
    --to=akpm@digeo.com \
    --cc=anton@samba.org \
    --cc=green@namesys.com \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mason@suse.com \
    --cc=shemminger@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.