All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dave Hansen <haveblue@us.ibm.com>
To: akpm@osdl.org
Cc: linux-fsdevel@vger.kernel.org, hch@infradead.org,
	viro@ftp.linux.org.uk, Dave Hansen <haveblue@us.ibm.com>
Subject: [PATCH 25/26] r/o bind mounts: scalable writer count
Date: Fri, 22 Jun 2007 13:03:36 -0700	[thread overview]
Message-ID: <20070622200336.4C7DB006@kernel> (raw)
In-Reply-To: <20070622200303.82D9CC3A@kernel>


This uses a statically-allocated percpu variable.  All
operations are local to a cpu as long that cpu operates on
the same mount, and there are no writer count imbalances.
Writer count imbalances happen when a write is taken on one
cpu, and released on another, like when an open/close pair
is performed on two different cpus because the task moved.

I've written a little benchmark to sit in a loop for a couple
of seconds in several cpus in parallel doing open/write/close
loops.

http://sr71.net/~dave/linux/openbench.c

The code in here is a a worst-possible case for this patch.
It does opens on a _pair_ of files in two different mounts
in parallel.  This should cause my code to lose its "operate
on the same mount" optimization completely.  This worst-case
scenario causes a 3% degredation in the benchmark.

I could probably get rid of even this 3%, but it would be
more complex than what I have here, and I think this is
getting into acceptable territory.  In practice, I expect
writing more than 3 bytes to a file, as well as disk I/O
to mask any effects that this has.

---

A little history:

These patches used to use a single atomic_t in each vfsmount
to track the number of writers to the mount at any given time.
Open a file for write anywhere on the mount, and you increment
the atomic_t.

This didn't scale on NUMA or SMP.  It caused something like a
4x slowdown in open/write/close operations.  It bounced
cachelines around like mad.  I tried out a new system with a
per-node spinlock and atomic in each vfsmount to replace the
old single atomic.  It worked _much_ better on NUMA, but still
caused a ~6% regression on the same open/write/close operation
set on a normal 4-way SMP machine, because it was still
contended inside of a node.

We could generalize this lock, but this is an awfully specialized
situation, and I'd be worried that people would try to use it
when it isn't absolutely necessary.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/fs/namei.c            |    6 +
 lxc-dave/fs/namespace.c        |  140 ++++++++++++++++++++++++++++++++++++++++-
 lxc-dave/include/linux/mount.h |    9 ++
 3 files changed, 150 insertions(+), 5 deletions(-)

diff -puN fs/namei.c~numa_mnt_want_write fs/namei.c
--- lxc/fs/namei.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namei.c	2007-06-22 10:12:46.000000000 -0700
@@ -231,10 +231,12 @@ int permission(struct inode *inode, int 
 	int retval, submask;
 
 	if (mask & MAY_WRITE) {
-
 		/*
-		 * Nobody gets write access to a read-only fs.
+		 * If this WARN_ON() is hit, it likely means that
+		 * there was a missed mnt_want_write() on the path
+		 * leading here.
 		 */
+		WARN_ON(__mnt_is_readonly(nd->mnt));
 		if (IS_RDONLY(inode) &&
 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 			return -EROFS;
diff -puN fs/namespace.c~numa_mnt_want_write fs/namespace.c
--- lxc/fs/namespace.c~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namespace.c	2007-06-22 10:21:39.000000000 -0700
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -51,6 +52,8 @@ static inline unsigned long hash(struct 
 	return tmp & hash_mask;
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -64,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+		atomic_set(&mnt->__mnt_writers, 0);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -76,19 +80,149 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 }
 
-int mnt_want_write(struct vfsmount *mnt)
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		writer->count = 0;
+	}
 	return 0;
 }
+fs_initcall(init_mnt_writers);
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	if (cpu_writer->mnt != mnt) {
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = mnt;
+	}
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_and_coalesce_cpu_mnt_writer_counts(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+static void mnt_unlock_cpus(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	/*
+	 * This should be fast the vast majority
+	 * of the time because everyone will only
+	 * be reading it and will share cachelines.
+	 */
+	while (atomic_read(&mnt->__mnt_writers) <
+		MNT_WRITER_UNDERFLOW_LIMIT) {
+		/*
+		 * It isn't necessary to hold all of the locks
+		 * at the same time, but doing it this way makes
+		 * us share a lot more code.
+		 */
+		lock_and_coalesce_cpu_mnt_writer_counts();
+		mnt_unlock_cpus();
+	}
+}
+
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	if (cpu_writer->count > 0)
+		cpu_writer->count--;
+	else
+		atomic_dec(&mnt->__mnt_writers);
+	spin_unlock(&cpu_writer->lock);
+	handle_write_count_underflow(mnt);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	lock_and_coalesce_cpu_mnt_writer_counts();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
+	 */
+out:
+	mnt_unlock_cpus();
+	return ret;
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
diff -puN include/linux/mount.h~numa_mnt_want_write include/linux/mount.h
--- lxc/include/linux/mount.h~numa_mnt_want_write	2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/include/linux/mount.h	2007-06-22 10:21:39.000000000 -0700
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -61,6 +62,14 @@ struct vfsmount {
 	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
+
+	/*
+	 * This value is not stable unless all of the
+	 * mnt_writers[] spinlocks are held, and all
+	 * mnt_writer[]s on this mount have 0 as
+	 * their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
_

  parent reply	other threads:[~2007-06-22 20:03 UTC|newest]

Thread overview: 83+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-06-22 20:03 [PATCH 00/26] Mount writer count and read-only bind mounts Dave Hansen
2007-06-22 20:03 ` [PATCH 01/26] document nlink function Dave Hansen
2007-06-23  7:36   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 02/26] ext3: remove extra IS_RDONLY() check Dave Hansen
2007-06-23  7:36   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 03/26] ext4: " Dave Hansen
2007-06-23  7:37   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 04/26] filesystem helpers for custom 'struct file's Dave Hansen
2007-06-23  7:38   ` Christoph Hellwig
2007-06-25 14:53     ` Dave Hansen
2007-06-23 16:52   ` Andrew Morton
2007-06-25 15:37     ` Dave Hansen
2007-06-25 17:25       ` Andrew Morton
2007-06-25 17:32         ` Dave Hansen
2007-06-30  9:35           ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 05/26] r/o bind mounts: stub functions Dave Hansen
2007-06-23  7:39   ` Christoph Hellwig
2007-06-23 16:52   ` Andrew Morton
2007-06-25 15:49     ` Dave Hansen
2007-06-22 20:03 ` [PATCH 06/26] elevate write count open()'d files Dave Hansen
2007-06-23  7:40   ` Christoph Hellwig
2007-06-25 15:03     ` Dave Hansen
2007-06-22 20:03 ` [PATCH 07/26] r/o bind mounts: elevate write count for some ioctls Dave Hansen
2007-06-23  7:42   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 08/26] elevate writer count for chown and friends Dave Hansen
2007-06-23  7:43   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 09/26] make access() use mnt check Dave Hansen
2007-06-23  7:45   ` Christoph Hellwig
2007-06-25 18:27     ` Dave Hansen
2007-06-26 19:04       ` Dave Kleikamp
2007-06-30  9:37       ` Christoph Hellwig
2007-07-02 16:09         ` Dave Hansen
2007-06-22 20:03 ` [PATCH 10/26] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
2007-06-23  7:45   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 11/26] elevate write count during entire ncp_ioctl() Dave Hansen
2007-06-22 20:03 ` [PATCH 12/26] elevate write count for link and symlink calls Dave Hansen
2007-06-22 20:03 ` [PATCH 13/26] elevate mount count for extended attributes Dave Hansen
2007-06-22 20:03 ` [PATCH 14/26] elevate write count for file_update_time() Dave Hansen
2007-06-23  7:46   ` Christoph Hellwig
2007-06-25 18:32     ` Dave Hansen
2007-06-30  9:38       ` Christoph Hellwig
2007-07-06 19:17     ` Dave Hansen
2007-06-22 20:03 ` [PATCH 15/26] mount_is_safe(): add comment Dave Hansen
2007-06-23  7:47   ` Christoph Hellwig
2007-06-25 15:10     ` Dave Hansen
2007-06-22 20:03 ` [PATCH 16/26] unix_find_other() elevate write count for touch_atime() Dave Hansen
2007-06-23  7:47   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 17/26] elevate write count over calls to vfs_rename() Dave Hansen
2007-06-23  7:49   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 18/26] nfs: check mnt instead of superblock directly Dave Hansen
2007-06-23  7:49   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 19/26] elevate writer count for do_sys_truncate() Dave Hansen
2007-06-23  7:49   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 20/26] elevate write count for do_utimes() Dave Hansen
2007-06-23  7:49   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 21/26] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
2007-06-23  7:50   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 22/26] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
2007-06-23  7:51   ` Christoph Hellwig
2007-06-25 15:19     ` Dave Hansen
2007-06-30  9:39       ` Christoph Hellwig
2007-07-02 23:31         ` Dave Hansen
2007-07-05 22:43         ` Dave Hansen
2007-07-07 18:25           ` Jan Engelhardt
2007-07-09 19:04             ` Dave Hansen
2007-07-11 10:22             ` Christoph Hellwig
2007-07-11 10:22           ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 23/26] elevate mnt writers for vfs_unlink() callers Dave Hansen
2007-06-23  7:51   ` Christoph Hellwig
2007-06-22 20:03 ` [PATCH 24/26] do_rmdir(): elevate write count Dave Hansen
2007-06-23  7:51   ` Christoph Hellwig
2007-06-22 20:03 ` Dave Hansen [this message]
2007-06-23 11:28   ` [PATCH 25/26] r/o bind mounts: scalable writer count Miklos Szeredi
2007-06-23 11:31     ` Miklos Szeredi
2007-06-25 15:36     ` Dave Hansen
2007-06-25 19:09       ` Miklos Szeredi
2007-06-23 16:52   ` Andrew Morton
2007-06-25 15:47     ` Dave Hansen
2007-06-22 20:03 ` [PATCH 26/26] honor r/w changes at do_remount() time Dave Hansen
2007-06-23  7:51   ` Christoph Hellwig
2007-06-23 16:52 ` [PATCH 00/26] Mount writer count and read-only bind mounts Andrew Morton
2007-06-25 15:45   ` Dave Hansen
2007-06-30  9:57   ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070622200336.4C7DB006@kernel \
    --to=haveblue@us.ibm.com \
    --cc=akpm@osdl.org \
    --cc=hch@infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@ftp.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.