From: Dave Hansen <haveblue@us.ibm.com>
To: akpm@osdl.org
Cc: linux-fsdevel@vger.kernel.org, hch@infradead.org,
viro@ftp.linux.org.uk, Dave Hansen <haveblue@us.ibm.com>
Subject: [PATCH 25/26] r/o bind mounts: scalable writer count
Date: Fri, 22 Jun 2007 13:03:36 -0700
Message-ID: <20070622200336.4C7DB006@kernel>
In-Reply-To: <20070622200303.82D9CC3A@kernel>
This uses a statically-allocated percpu variable.  All
operations are local to a cpu as long as that cpu operates on
the same mount and there are no writer count imbalances.
Writer count imbalances happen when a write is taken on one
cpu, and released on another, like when an open/close pair
is performed on two different cpus because the task moved.
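To make that case concrete, here is a tiny userspace sketch
(illustration only; the path is made up and it assumes
sched_setaffinity() is permitted) that forces the open() and the
close() of the same fd onto different cpus, which is exactly the
pattern that bumps one cpu's counter and drops another's:

/*
 * Hypothetical illustration only: pin the open() to cpu 0 and the
 * close() to cpu 1, so the mount's writer count is incremented in
 * one per-cpu counter and decremented in another.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>

static void pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
}

int main(void)
{
	int fd;

	pin_to_cpu(0);		/* mnt_want_write() accounted on cpu 0 */
	fd = open("/mnt/foo/file", O_WRONLY | O_CREAT, 0644);
	pin_to_cpu(1);		/* mnt_drop_write() accounted on cpu 1 */
	close(fd);
	return 0;
}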
I've written a little benchmark that sits in a loop for a couple
of seconds on several cpus in parallel, doing open/write/close
loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a worst-possible case for this patch.
It does opens on a _pair_ of files in two different mounts
in parallel. This should cause my code to lose its "operate
on the same mount" optimization completely. This worst-case
scenario causes a 3% degradation in the benchmark.
I could probably get rid of even this 3%, but it would be
more complex than what I have here, and I think this is
getting into acceptable territory.  In practice, I expect
writing more than 3 bytes to a file, as well as disk I/O,
to mask any effects that this has.
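For reference, the core of each benchmark worker is roughly the
loop below (a sketch only, not the actual openbench.c; the real
thing at the URL above adds the cpu binding, the two-mount
worst case, and the timing, and the path and iteration count
here are placeholders):

/*
 * Rough sketch of the benchmark's inner loop.  Path and
 * iteration count are placeholders.
 */
#include <fcntl.h>
#include <unistd.h>

static void open_write_close_loop(const char *path, long iters)
{
	long i;

	for (i = 0; i < iters; i++) {
		int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (fd < 0)
			break;
		write(fd, "foo", 3);	/* the 3 bytes mentioned above */
		close(fd);
	}
}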
---
A little history:
These patches used to use a single atomic_t in each vfsmount
to track the number of writers to the mount at any given time.
Open a file for write anywhere on the mount, and you increment
the atomic_t.
This didn't scale on NUMA or SMP. It caused something like a
4x slowdown in open/write/close operations. It bounced
cachelines around like mad. I tried out a new system with a
per-node spinlock and atomic in each vfsmount to replace the
old single atomic. It worked _much_ better on NUMA, but still
caused a ~6% regression on the same open/write/close operation
set on a normal 4-way SMP machine, because it was still
contended inside of a node.
We could generalize this lock, but this is an awfully specialized
situation, and I'd be worried that people would try to use it
when it isn't absolutely necessary.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 6 +
lxc-dave/fs/namespace.c | 140 ++++++++++++++++++++++++++++++++++++++++-
lxc-dave/include/linux/mount.h | 9 ++
3 files changed, 150 insertions(+), 5 deletions(-)
diff -puN fs/namei.c~numa_mnt_want_write fs/namei.c
--- lxc/fs/namei.c~numa_mnt_want_write 2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-06-22 10:12:46.000000000 -0700
@@ -231,10 +231,12 @@ int permission(struct inode *inode, int
int retval, submask;
if (mask & MAY_WRITE) {
-
/*
- * Nobody gets write access to a read-only fs.
+ * If this WARN_ON() is hit, it likely means that
+ * there was a missed mnt_want_write() on the path
+ * leading here.
*/
+ WARN_ON(__mnt_is_readonly(nd->mnt));
if (IS_RDONLY(inode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
return -EROFS;
diff -puN fs/namespace.c~numa_mnt_want_write fs/namespace.c
--- lxc/fs/namespace.c~numa_mnt_want_write 2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/fs/namespace.c 2007-06-22 10:21:39.000000000 -0700
@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -51,6 +52,8 @@ static inline unsigned long hash(struct
return tmp & hash_mask;
}
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -64,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -76,19 +80,149 @@ struct vfsmount *alloc_vfsmnt(const char
return mnt;
}
-int mnt_want_write(struct vfsmount *mnt)
+
+struct mnt_writer {
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t lock;
+ unsigned long count;
+ struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
{
- if (__mnt_is_readonly(mnt))
- return -EROFS;
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+ spin_lock_init(&writer->lock);
+ writer->count = 0;
+ }
return 0;
}
+fs_initcall(init_mnt_writers);
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+ if (!cpu_writer->mnt)
+ return;
+ atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+ cpu_writer->count = 0;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+ int ret = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ if (cpu_writer->mnt != mnt) {
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = mnt;
+ }
+ cpu_writer->count++;
+out:
+ spin_unlock(&cpu_writer->lock);
+ put_cpu_var(mnt_writers);
+ return ret;
+}
EXPORT_SYMBOL_GPL(mnt_want_write);
+static void lock_and_coalesce_cpu_mnt_writer_counts(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_lock(&cpu_writer->lock);
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = NULL;
+ }
+}
+
+static void mnt_unlock_cpus(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_unlock(&cpu_writer->lock);
+ }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+ /*
+ * This should be fast the vast majority
+ * of the time because everyone will only
+ * be reading it and will share cachelines.
+ */
+ while (atomic_read(&mnt->__mnt_writers) <
+ MNT_WRITER_UNDERFLOW_LIMIT) {
+ /*
+ * It isn't necessary to hold all of the locks
+ * at the same time, but doing it this way makes
+ * us share a lot more code.
+ */
+ lock_and_coalesce_cpu_mnt_writer_counts();
+ mnt_unlock_cpus();
+ }
+}
+
void mnt_drop_write(struct vfsmount *mnt)
{
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+ put_cpu_var(mnt_writers);
+ if (cpu_writer->count > 0)
+ cpu_writer->count--;
+ else
+ atomic_dec(&mnt->__mnt_writers);
+ spin_unlock(&cpu_writer->lock);
+ handle_write_count_underflow(mnt);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ lock_and_coalesce_cpu_mnt_writer_counts();
+ /*
+ * With all the locks held, this value is stable
+ */
+ if (atomic_read(&mnt->__mnt_writers) > 0) {
+ ret = -EBUSY;
+ goto out;
+ }
+ /*
+ * actually set mount's r/o flag here to make
+ * __mnt_is_readonly() true, which keeps anyone
+ * from doing a successful mnt_want_write().
+ */
+out:
+ mnt_unlock_cpus();
+ return ret;
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
diff -puN include/linux/mount.h~numa_mnt_want_write include/linux/mount.h
--- lxc/include/linux/mount.h~numa_mnt_want_write 2007-06-22 10:12:46.000000000 -0700
+++ lxc-dave/include/linux/mount.h 2007-06-22 10:21:39.000000000 -0700
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
@@ -61,6 +62,14 @@ struct vfsmount {
atomic_t mnt_count;
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
+
+ /*
+ * This value is not stable unless all of the
+ * mnt_writers[] spinlocks are held, and all
+ * mnt_writer[]s on this mount have 0 as
+ * their ->count
+ */
+ atomic_t __mnt_writers;
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
_
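For context, the earlier patches in this series wrap write-side
operations in the pattern below (a sketch with a made-up function
name, not an actual call site from the series):

/*
 * Sketch of the intended caller pattern; example_write_op() is a
 * made-up name.
 */
static int example_write_op(struct vfsmount *mnt)
{
	int err;

	err = mnt_want_write(mnt);	/* per-cpu count bumped here */
	if (err)
		return err;		/* -EROFS on a read-only mount */

	/* ... perform the actual write-side operation ... */

	mnt_drop_write(mnt);		/* per-cpu count dropped here */
	return 0;
}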