* [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-19 6:19 UTC
To: Dave Hansen, Linux Memory Management List, linux-fsdevel
Hi. Fun, chasing down performance regressions.... I wonder what people think
about these patches? Is it OK to bloat struct vfsmount? Any races? This could
be made even faster if mnt_make_readonly could tolerate a really high latency
synchronize_rcu()... can it?
--
This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark yes, but it exercises some important paths in the mm.
Before:
avg = 501.9
std = 14.7773
After:
avg = 462.286
std = 5.46106
(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)
It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the memory barrier in the new
fastpath is still pretty costly, unfortunately). It results in about 900 bytes
smaller code too. It does increase the size of a vfsmount, however.
It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.
---
fs/namespace.c | 251 +++++++++++++++-----------------------------------
include/linux/mount.h | 21 +++-
2 files changed, 94 insertions(+), 178 deletions(-)
Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c
+++ linux-2.6/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
}
return mnt;
+#ifdef CONFIG_SMP
+out_free_devname:
+ kfree(mnt->mnt_devname);
+#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-struct mnt_writer {
- /*
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- */
- spinlock_t lock;
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+ mnt->mnt_writers++;
+#endif
+}
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
- writer->count = 0;
- }
- return 0;
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+ mnt->mnt_writers--;
+#endif
}
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
{
+#ifdef CONFIG_SMP
+ unsigned int count = 0;
int cpu;
- struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
}
-}
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
- if (!cpu_writer->mnt)
- return;
- /*
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- */
- if (!cpu_writer->count)
- return;
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
-{
- if (cpu_writer->mnt == mnt)
- return;
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return count;
+#else
+ return mnt->mnt_writers;
+#endif
}
/*
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
int mnt_want_write(struct vfsmount *mnt)
{
int ret = 0;
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ /*
+ * The store from inc_mnt_writers must be visible before we enter the
+ * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ */
+ smp_mb();
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ /*
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
+ */
+ smp_rmb();
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
}
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
-{
- int cpu;
- struct mnt_writer *cpu_writer;
-
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- }
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- return;
- /*
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- */
- lock_mnt_writers();
- /*
- * vfsmount_lock is for mnt_flags.
- */
- spin_lock(&vfsmount_lock);
- /*
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- * a bug.
- */
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- "count: %d\n",
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- }
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
-}
-
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
-
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
-
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- } else {
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- }
-
- spin_unlock(&cpu_writer->lock);
- /*
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- */
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
+ preempt_disable();
+ dec_mnt_writers(mnt);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
{
int ret = 0;
- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
- * With all the locks held, this value is stable
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
*/
- if (atomic_read(&mnt->__mnt_writers) > 0) {
+ smp_mb();
+
+ /*
+ * With writers on hold, if this value is zero, then there are definitely
+ * no active writers (although held writers may subsequently increment
+ * the count, they'll have to wait, and decrement it after seeing
+ * MNT_READONLY).
+ */
+ if (count_mnt_writers(mnt) > 0) {
ret = -EBUSY;
goto out;
}
- /*
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
- */
- spin_lock(&vfsmount_lock);
if (!ret)
mnt->mnt_flags |= MNT_READONLY;
- spin_unlock(&vfsmount_lock);
out:
- unlock_mnt_writers();
+ /*
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ */
+ smp_wmb();
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ spin_unlock(&vfsmount_lock);
return ret;
}
@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_writers);
+#endif
kmem_cache_free(mnt_cache, mnt);
}
@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct
static inline void __mntput(struct vfsmount *mnt)
{
- int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- * can come in.
- */
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- if (cpu_writer->mnt != mnt)
- continue;
- spin_lock(&cpu_writer->lock);
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- /*
- * Might as well do this so that no one
- * ever sees the pointer and expects
- * it to be valid.
- */
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
- }
- /*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ WARN_ON(count_mnt_writers(mnt));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h
+++ linux-2.6/include/linux/mount.h
@@ -30,6 +30,7 @@ struct mnt_namespace;
#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x400
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -64,13 +65,23 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
- /*
- * This value is not stable unless all of the mnt_writers[] spinlocks
- * are held, and all mnt_writer[]s on this mount have 0 as their ->count
- */
- atomic_t __mnt_writers;
+
+#ifdef CONFIG_SMP
+ int *mnt_writers;
+#else
+ int mnt_writers;
+#endif
};
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
* [rfc][patch 2/2] mnt_want_write speedup 2
From: Nick Piggin @ 2008-12-19 6:20 UTC
To: Dave Hansen, Linux Memory Management List, linux-fsdevel
This patch speeds up lmbench lat_mmap test by about another 2% after the
first patch.
Before:
avg = 462.286
std = 5.46106
After:
avg = 453.12
std = 9.58257
(50 runs of each, stddev gives a reasonable confidence)
It does this by introducing mnt_clone_write, which avoids some heavyweight
operations of mnt_want_write if called on a vfsmount which we know already
has a write count; and mnt_want_write_file, which can call mnt_clone_write
if the file is open for write.
After these two patches, mnt_want_write and mnt_drop_write go from 7% on
the profile down to 1.3% (including mnt_clone_write).
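For callers that already have a struct file, the conversion is then just (a
sketch of the calling convention, not an excerpt from the patch):

	err = mnt_want_write_file(file->f_path.mnt, file);
	if (err)
		return err;
	...
	mnt_drop_write(file->f_path.mnt);

mnt_drop_write pairs with mnt_clone_write and mnt_want_write_file exactly as
it does with mnt_want_write.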
---
fs/file_table.c | 3 +--
fs/inode.c | 2 +-
fs/namespace.c | 38 ++++++++++++++++++++++++++++++++++++++
fs/open.c | 4 ++--
fs/xattr.c | 4 ++--
include/linux/mount.h | 2 ++
6 files changed, 46 insertions(+), 7 deletions(-)
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -210,8 +210,7 @@ int init_file(struct file *file, struct
*/
if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
file_take_write(file);
- error = mnt_want_write(mnt);
- WARN_ON(error);
+ mnt_clone_write(mnt);
}
return error;
}
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c
+++ linux-2.6/fs/inode.c
@@ -1249,7 +1249,7 @@ void file_update_time(struct file *file)
if (IS_NOCMTIME(inode))
return;
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file->f_path.mnt, file);
if (err)
return;
Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c
+++ linux-2.6/fs/namespace.c
@@ -264,6 +264,44 @@ out:
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+void mnt_clone_write(struct vfsmount *mnt)
+{
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file whose mount to take a write on
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct vfsmount *mnt, struct file *file)
+{
+ if (!(file->f_mode & FMODE_WRITE))
+ return mnt_want_write(mnt);
+ else {
+ mnt_clone_write(mnt);
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
+/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c
+++ linux-2.6/fs/open.c
@@ -597,7 +597,7 @@ asmlinkage long sys_fchmod(unsigned int
audit_inode(NULL, dentry);
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file->f_path.mnt, file);
if (err)
goto out_putf;
mutex_lock(&inode->i_mutex);
@@ -748,7 +748,7 @@ asmlinkage long sys_fchown(unsigned int
if (!file)
goto out;
- error = mnt_want_write(file->f_path.mnt);
+ error = mnt_want_write_file(file->f_path.mnt, file);
if (error)
goto out_fput;
dentry = file->f_path.dentry;
Index: linux-2.6/fs/xattr.c
===================================================================
--- linux-2.6.orig/fs/xattr.c
+++ linux-2.6/fs/xattr.c
@@ -302,7 +302,7 @@ sys_fsetxattr(int fd, const char __user
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = mnt_want_write(f->f_path.mnt);
+ error = mnt_want_write_file(f->f_path.mnt, f);
if (!error) {
error = setxattr(dentry, name, value, size, flags);
mnt_drop_write(f->f_path.mnt);
@@ -533,7 +533,7 @@ sys_fremovexattr(int fd, const char __us
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = mnt_want_write(f->f_path.mnt);
+ error = mnt_want_write_file(f->f_path.mnt, f);
if (!error) {
error = removexattr(dentry, name);
mnt_drop_write(f->f_path.mnt);
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h
+++ linux-2.6/include/linux/mount.h
@@ -90,6 +90,8 @@ static inline struct vfsmount *mntget(st
}
extern int mnt_want_write(struct vfsmount *mnt);
+extern int mnt_want_write_file(struct vfsmount *mnt, struct file *file);
+extern void mnt_clone_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Dave Hansen @ 2008-12-19 6:34 UTC
To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel
On Fri, 2008-12-19 at 07:19 +0100, Nick Piggin wrote:
> Hi. Fun, chasing down performance regressions.... I wonder what people think
> about these patches? Is it OK to bloat struct vfsmount? Any races?
Very cool stuff, Nick. I especially like how much it simplifies things
and removes *SO* much code.
Bloating the vfsmount was one of the things that I really, really tried to
avoid. When I start to think about the SGI machines, it gets me really
worried. I went to a lot of trouble to make sure that the per-vfsmount
memory overhead didn't scale with the number of cpus.
> This could
> be made even faster if mnt_make_readonly could tolerate a really high latency
> synchronize_rcu()... can it?
Yes, I think it can tolerate it. There's a lot of work to do, and we
already have to go touch all the other per-cpu objects. There also
tends to be writeout when this happens, so I don't think a few seconds,
even, will be noticed.
> This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up
> basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
> A microbenchmark yes, but it exercises some important paths in the mm.
Do you know where the overhead actually came from? Was it the
spinlocks? Was removing all the atomic ops what really helped?
I'll take a more in-depth look at your code tomorrow and see if I see
any races.
-- Dave
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-19 6:52 UTC
To: Dave Hansen; +Cc: Linux Memory Management List, linux-fsdevel
On Thu, Dec 18, 2008 at 10:34:52PM -0800, Dave Hansen wrote:
> On Fri, 2008-12-19 at 07:19 +0100, Nick Piggin wrote:
> > Hi. Fun, chasing down performance regressions.... I wonder what people think
> > about these patches? Is it OK to bloat struct vfsmount? Any races?
>
> Very cool stuff, Nick. I especially like how much it simplifies things
> and removes *SO* much code.
Thanks.
> Bloating the vfsmount was one of the things that I really, really tried to
> avoid. When I start to think about the SGI machines, it gets me really
> worried. I went to a lot of trouble to make sure that the per-vfsmount
> memory overhead didn't scale with the number of cpus.
Well, OTOH, the SGI machines have a lot of memory ;) I *think* that not
many systems have thousands of mounts (given that the mount hashtable is
a fixed-size single page), but I might be wrong, which is why I'm asking here.
Let's say a 4096 CPU machine with one mount for each CPU (4096 mounts): I
think that should only use about 128MB total for the counters. OK, yes that
is a lot ;) but not exactly insane for a machine of that size.
Say for a 32 CPU system with 10,000 mounts, it's about 9MB.
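(Rough arithmetic behind the 128MB figure, assuming about 8 bytes of percpu
storage per counter per CPU -- a 4-byte int plus allocator overhead:
4096 mounts * 4096 CPUs * 8 bytes = 128MiB.)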
> > This could
> > be made even faster if mnt_make_readonly could tolerate a really high latency
> > synchronize_rcu()... can it?
>
> Yes, I think it can tolerate it. There's a lot of work to do, and we
> already have to go touch all the other per-cpu objects. There also
> tends to be writeout when this happens, so I don't think a few seconds,
> even, will be noticed.
That would be good. After the first patch, mnt_want_write still shows up
on profiles, and almost all the hits come right after the memory fence from
the smp_mb() there.
It would be really nice to use RCU here. I think it might allow us to
eliminate the memory barriers.
> > This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up
> > basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
> > A microbenchmark yes, but it exercises some important paths in the mm.
>
> Do you know where the overhead actually came from? Was it the
> spinlocks? Was removing all the atomic ops what really helped?
I think about 95% of the unhalted cycles were hit against the two
instructions after the call to spin_lock. It wasn't actually flipping
the write counter per-cpu cache as far as I could see. I didn't save
the instruction-level profiles, but I'll do another run if people
think it will be sane to use RCU here.
> I'll take a more in-depth look at your code tomorrow and see if I see
> any races.
Thanks.
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Dave Hansen @ 2008-12-19 6:54 UTC
To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel
On Fri, 2008-12-19 at 07:19 +0100, Nick Piggin wrote:
> @@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
> {
> int ret = 0;
>
> - lock_mnt_writers();
> + spin_lock(&vfsmount_lock);
> + mnt->mnt_flags |= MNT_WRITE_HOLD;
> /*
> - * With all the locks held, this value is stable
> + * After storing MNT_WRITE_HOLD, we'll read the counters. This store
> + * should be visible before we do.
> */
> - if (atomic_read(&mnt->__mnt_writers) > 0) {
> + smp_mb();
> +
> + /*
> + * With writers on hold, if this value is zero, then there are definitely
> + * no active writers (although held writers may subsequently increment
> + * the count, they'll have to wait, and decrement it after seeing
> + * MNT_READONLY).
> + */
> + if (count_mnt_writers(mnt) > 0) {
> ret = -EBUSY;
OK, I think this is one of the big races inherent with this approach.
There's nothing in here to ensure that no one is in the middle of an
update during this code. The preempt_disable() will, of course, reduce
the window, but I think there's still a race here.
Is this where you wanted to put the synchronize_rcu()? That's a nice
touch, because *that* will ensure that no one is in the middle
of an increment here and that they will, at worst, be blocking on the
MNT_WRITE_HOLD thing.
I kinda remember going down this path a few times, but you may have
cracked the problem. Dunno. I need to stare at the code a bit more
before I'm convinced. I'm optimistic, but a bit skeptical this can
work. :)
I am really wondering where all the cost is that you're observing in
those benchmarks. Have you captured any profiles by chance?
-- Dave
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-19 6:56 UTC
To: Dave Hansen; +Cc: Linux Memory Management List, linux-fsdevel
On Fri, Dec 19, 2008 at 07:52:42AM +0100, Nick Piggin wrote:
> On Thu, Dec 18, 2008 at 10:34:52PM -0800, Dave Hansen wrote:
> > Yes, I think it can tolerate it. There's a lot of work to do, and we
> > already have to go touch all the other per-cpu objects. There also
> > tends to be writeout when this happens, so I don't think a few seconds,
> > even, will be noticed.
>
> That would be good. After the first patch, mnt_want_write still shows up
> on profiles, and almost all the hits come right after the memory fence from
> the smp_mb() there.
>
> It would be really nice to use RCU here. I think it might allow us to
> eliminate the memory barriers.
Actually we might be able to use a seqcounter to eliminate the most
expensive barrier (the smp_mb()). But that adds a couple of smp_rmb()s,
which would be slower on some architectures... not to mention more code
and branches.
But I'll investigate that option if RCU is ruled out.
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-19 7:03 UTC
To: Dave Hansen; +Cc: Linux Memory Management List, linux-fsdevel
On Thu, Dec 18, 2008 at 10:54:57PM -0800, Dave Hansen wrote:
> On Fri, 2008-12-19 at 07:19 +0100, Nick Piggin wrote:
> > @@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
> > {
> > int ret = 0;
> >
> > - lock_mnt_writers();
> > + spin_lock(&vfsmount_lock);
> > + mnt->mnt_flags |= MNT_WRITE_HOLD;
> > /*
> > - * With all the locks held, this value is stable
> > + * After storing MNT_WRITE_HOLD, we'll read the counters. This store
> > + * should be visible before we do.
> > */
> > - if (atomic_read(&mnt->__mnt_writers) > 0) {
> > + smp_mb();
> > +
> > + /*
> > + * With writers on hold, if this value is zero, then there are definitely
> > + * no active writers (although held writers may subsequently increment
> > + * the count, they'll have to wait, and decrement it after seeing
> > + * MNT_READONLY).
> > + */
> > + if (count_mnt_writers(mnt) > 0) {
> > ret = -EBUSY;
>
> OK, I think this is one of the big races inherent with this approach.
> There's nothing in here to ensure that no one is in the middle of an
> update during this code. The preempt_disable() will, of course, reduce
> the window, but I think there's still a race here.
MNT_WRITE_HOLD is set, so any writer that has already made it past
the MNT_WRITE_HOLD loop will have its count visible here. Any writer
that has not made it past that loop will wait until the slowpath
completes and then the fastpath will go on to check whether the
mount is still writeable.
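To spell the pairing out (distilled from patch 1, not new code):

        mnt_want_write() fastpath            mnt_make_readonly() slowpath
        -------------------------            ----------------------------
        inc_mnt_writers(mnt);                mnt->mnt_flags |= MNT_WRITE_HOLD;
        smp_mb();                            smp_mb();
        while (flags & MNT_WRITE_HOLD)       if (count_mnt_writers(mnt) > 0)
                cpu_relax();                         return -EBUSY;
        smp_rmb();                           mnt->mnt_flags |= MNT_READONLY;
        if (__mnt_is_readonly(mnt)) {        smp_wmb();
                dec_mnt_writers(mnt);        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
                return -EROFS;
        }

Either the fastpath's increment is visible to the counter sum (and the remount
gets -EBUSY), or the fastpath sees MNT_WRITE_HOLD, waits, and then observes
MNT_READONLY and backs out.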
> Is this where you wanted to put the synchronize_rcu()? That's a nice
> touch, because *that* will ensure that no one is in the middle
> of an increment here and that they will, at worst, be blocking on the
> MNT_WRITE_HOLD thing.
Basically the synchronize_rcu would go in place of the smp_mb() here,
and it would automatically eliminate the corresponding smp_mb() in
the fastpath (because a quiescent state on a CPU is guaranteed to
include a barrier).
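A rough sketch of that variant, purely to illustrate the idea (untested; it
also glosses over the fact that synchronize_rcu() sleeps, so MNT_WRITE_HOLD
can no longer be set and cleared inside a single vfsmount_lock section the
way patch 1 does it):

	static int mnt_make_readonly(struct vfsmount *mnt)
	{
		int ret = 0;

		spin_lock(&vfsmount_lock);
		mnt->mnt_flags |= MNT_WRITE_HOLD;
		spin_unlock(&vfsmount_lock);

		/*
		 * Replaces the smp_mb(): by the time this returns every CPU
		 * has passed a quiescent state (which implies a full barrier),
		 * so increments from fastpath sections that started earlier
		 * are visible below, and later fastpath sections are
		 * guaranteed to observe MNT_WRITE_HOLD.
		 */
		synchronize_rcu();

		spin_lock(&vfsmount_lock);
		if (count_mnt_writers(mnt) > 0)
			ret = -EBUSY;
		else
			mnt->mnt_flags |= MNT_READONLY;
		smp_wmb();
		mnt->mnt_flags &= ~MNT_WRITE_HOLD;
		spin_unlock(&vfsmount_lock);

		return ret;
	}

The fastpath could then drop its smp_mb() and keep only the MNT_WRITE_HOLD
spin plus the smp_rmb() after it. How remount-vs-remount serialisation is
kept while the lock is dropped is left open here.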
> I kinda remember going down this path a few times, but you may have
> cracked the problem. Dunno. I need to stare at the code a bit more
> before I'm convinced. I'm optimistic, but a bit skeptical this can
> work. :)
>
> I am really wondering where all the cost is that you're observing in
> those benchmarks. Have you captured any profiles by chance?
Yes, as I said, the cycles seem to be in the spin_lock instructions.
It's hard to see _exactly_ what's going on with oprofile and an
out-of-order CPU, but the cycles are all right after spin_lock
returns.
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Dave Hansen @ 2008-12-19 15:32 UTC
To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel
On Fri, 2008-12-19 at 08:03 +0100, Nick Piggin wrote:
> On Thu, Dec 18, 2008 at 10:54:57PM -0800, Dave Hansen wrote:
> > On Fri, 2008-12-19 at 07:19 +0100, Nick Piggin wrote:
> > > @@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
> > > {
> > > int ret = 0;
> > >
> > > - lock_mnt_writers();
> > > + spin_lock(&vfsmount_lock);
> > > + mnt->mnt_flags |= MNT_WRITE_HOLD;
> > > /*
> > > - * With all the locks held, this value is stable
> > > + * After storing MNT_WRITE_HOLD, we'll read the counters. This store
> > > + * should be visible before we do.
> > > */
> > > - if (atomic_read(&mnt->__mnt_writers) > 0) {
> > > + smp_mb();
> > > +
> > > + /*
> > > + * With writers on hold, if this value is zero, then there are definitely
> > > + * no active writers (although held writers may subsequently increment
> > > + * the count, they'll have to wait, and decrement it after seeing
> > > + * MNT_READONLY).
> > > + */
> > > + if (count_mnt_writers(mnt) > 0) {
> > > ret = -EBUSY;
> >
> > OK, I think this is one of the big races inherent with this approach.
> > There's nothing in here to ensure that no one is in the middle of an
> > update during this code. The preempt_disable() will, of course, reduce
> > the window, but I think there's still a race here.
>
> MNT_WRITE_HOLD is set, so any writer that has already made it past
> the MNT_WANT_WRITE loop will have its count visible here. Any writer
> that has not made it past that loop will wait until the slowpath
> completes and then the fastpath will go on to check whether the
> mount is still writeable.
Ahh, got it. I'm slowly absorbing the barriers. Not the normal way I
code.
I thought there was another race with MNT_WRITE_HOLD since mnt_flags
isn't really managed atomically. But, by only modifying with the
vfsmount_lock, I think it is OK.
I also wondered if there was a possibility of getting a spurious -EBUSY
when remounting r/w->r/o. But, that turned out to just happen when the
fs was *already* r/o. So that looks good.
While this has cleared out a huge amount of complexity, I can't stop
wondering if this could be done with a wee bit more "normal" operations.
I'm pretty sure I couldn't have come up with this by myself, and I'm a
bit worried that I wouldn't be able to find a race in it if one reared
its ugly head.
Is there a real good reason to allocate the percpu counters dynamically?
Might as well stick them in the vfsmount and let the one
kmem_cache_zalloc() in alloc_vfsmnt() do a bit larger of an allocation.
Did you think that was going to bloat it to a compound allocation or
something? I hate the #ifdefs. :)
-- Dave
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-22 4:35 UTC
To: Dave Hansen; +Cc: Linux Memory Management List, linux-fsdevel
On Fri, Dec 19, 2008 at 07:32:01AM -0800, Dave Hansen wrote:
> On Fri, 2008-12-19 at 08:03 +0100, Nick Piggin wrote:
> > MNT_WRITE_HOLD is set, so any writer that has already made it past
> > the MNT_WANT_WRITE loop will have its count visible here. Any writer
> > that has not made it past that loop will wait until the slowpath
> > completes and then the fastpath will go on to check whether the
> > mount is still writeable.
>
> Ahh, got it. I'm slowly absorbing the barriers. Not the normal way I
> code.
>
> I thought there was another race with MNT_WRITE_HOLD since mnt_flags
> isn't really managed atomically. But, by only modifying with the
> vfsmount_lock, I think it is OK.
>
> I also wondered if there was a possibility of getting a spurious -EBUSY
> when remounting r/w->r/o. But, that turned out to just happen when the
> fs was *already* r/o. So that looks good.
>
> While this has cleared out a huge amount of complexity, I can't stop
> wondering if this could be done with a wee bit more "normal" operations.
> I'm pretty sure I couldn't have come up with this by myself, and I'm a
> bit worried that I wouldn't be able to find a race in it if one reared
> its ugly head.
It could be done with a seqcounter I think, but that adds more branches,
variables, and barriers to this fastpath. Perhaps I should simply add
a bit more documentation.
> Is there a real good reason to allocate the percpu counters dynamically?
> Might as well stick them in the vfsmount and let the one
> kmem_cache_zalloc() in alloc_vfsmnt() do a bit larger of an allocation.
> Did you think that was going to bloat it to a compound allocation or
> something? I hate the #ifdefs. :)
Distros want to ship big NR_CPUS kernels and have them run reasonably on
small num_possible_cpus() systems. But also, it would help to avoid
cacheline bouncing from false sharing (the allocpercpu.c code can also mess
this up for small objects like these counters, but that's a problem
with the allocpercpu code which should be fixed anyway).
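(For example, with a distro NR_CPUS=4096 build, an embedded int[NR_CPUS]
array would add 16KB to every struct vfsmount even on a 2-CPU box, whereas
alloc_percpu() only allocates for the possible CPUs.)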
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Dave Hansen @ 2008-12-29 23:00 UTC
To: Nick Piggin; +Cc: Linux Memory Management List, linux-fsdevel
On Mon, 2008-12-22 at 05:35 +0100, Nick Piggin wrote:
> > Is there a real good reason to allocate the percpu counters dynamically?
> > Might as well stick them in the vfsmount and let the one
> > kmem_cache_zalloc() in alloc_vfsmnt() do a bit larger of an allocation.
> > Did you think that was going to bloat it to a compound allocation or
> > something? I hate the #ifdefs. :)
>
> Distros want to ship big NR_CPUS kernels and have them run reasonably on
> small num_possible_cpus() systems. But also, it would help to avoid
> cacheline bouncing from false sharing (the allocpercpu.c code can also mess
> this up for small objects like these counters, but that's a problem
> with the allocpercpu code which should be fixed anyway).
I guess we could also play the old trick:
struct vfsmount
{
...
int mnt_writers[0];
};
And just
void __init mnt_init(void)
{
...
int size = sizeof(struct vfsmount) + num_possible_cpus() * sizeof(int);
- mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
+ mnt_cache = kmem_cache_create("mnt_cache", size,
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
That should save us the dereference from the pointer and still let it be
pretty flexible.
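The accessors from patch 1 would then collapse to something like (sketch):

	static inline void inc_mnt_writers(struct vfsmount *mnt)
	{
		mnt->mnt_writers[smp_processor_id()]++;
	}

with no CONFIG_SMP #ifdef needed, since on UP num_possible_cpus() is 1 and
smp_processor_id() is 0.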
-- Dave
* Re: [rfc][patch 1/2] mnt_want_write speedup 1
From: Nick Piggin @ 2008-12-30 4:02 UTC
To: Dave Hansen; +Cc: Linux Memory Management List, linux-fsdevel
On Mon, Dec 29, 2008 at 03:00:37PM -0800, Dave Hansen wrote:
> On Mon, 2008-12-22 at 05:35 +0100, Nick Piggin wrote:
> > > Is there a real good reason to allocate the percpu counters dynamically?
> > > Might as well stick them in the vfsmount and let the one
> > > kmem_cache_zalloc() in alloc_vfsmnt() do a bit larger of an allocation.
> > > Did you think that was going to bloat it to a compound allocation or
> > > something? I hate the #ifdefs. :)
> >
> > Distros want to ship big NR_CPUS kernels and have them run reasonably on
> > small num_possible_cpus() systems. But also, it would help to avoid
> > cacheline bouncing from false sharing (the allocpercpu.c code can also mess
> > this up for small objects like these counters, but that's a problem
> > with the allocpercpu code which should be fixed anyway).
>
> I guess we could also play the old trick:
>
> struct vfsmount
> {
> ...
> int mnt_writers[0];
> };
>
> And just
>
> void __init mnt_init(void)
> {
> ...
> int size = sizeof(struct vfsmount) + num_possible_cpus() * sizeof(int);
>
> - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
> + mnt_cache = kmem_cache_create("mnt_cache", size,
> 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
>
> That should save us the dereference from the pointer and still let it be
> pretty flexible.
Still results in cacheline contention, however...