From: Nick Piggin <npiggin@suse.de>
To: Al Viro <viro@ZenIV.linux.org.uk>,
Frank Mayhar <fmayhar@google.com>,
John Stultz <johnstul@us.ibm.com>,
Andi Kleen <ak@linux.intel.com>,
linux-fsdevel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@suse.de>,
Alan Cox <alan@lxorguk.ukuu.org.uk>,
"Eric W. Biederman" <ebiederm@xmission.com>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [patch 2/2] fs: scale files_lock
Date: Tue, 16 Mar 2010 20:46:30 +1100 [thread overview]
Message-ID: <20100316094630.GN2869@laptop> (raw)
In-Reply-To: <20100316094423.GM2869@laptop>
fs: scale files_lock
Improve scalability of files_lock by adding per-cpu, per-sb files lists,
protected with per-cpu locking, effectively turning it into a big-writer lock.
One difficulty with this approach is that a file can be removed from the list
by another CPU. We must track which per-cpu list the file is on. Scalability
could suffer if files are frequently removed from different cpu's list.
However loads with frequent removal of files imply short interval between
adding and removing the files, and the scheduler attempts to avoid moving
processes too far away. Also, even in the case of cross-CPU removal, the
hardware has much more opportunity to parallelise cacheline transfers with N
cachelines than with 1.
A worst-case test, in which 1 CPU allocates files that are subsequently freed by
N CPUs, degenerates to contending on a single lock, which is no worse than before.
When more than one CPU is allocating files, even if they are always freed by
different CPUs, there will be more parallelism than in the single-lock case.
Testing on a 2 socket, 8 core Opteron: I measure the number of times
the lock is taken to remove a file, the number of times it is removed by the
same CPU that added it, and the number of times it is removed by the same node
that added it.
Booting:
locks=25049 cpu-hits=23174 (92.5%) node-hits=23945 (95.6%)
kbuild -j16
locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%)
dbench 64
locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%)
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
fs/file_table.c | 155 ++++++++++++++++++++++++++++++++++++++---------------
fs/super.c | 18 ++++++
include/linux/fs.h | 7 ++
3 files changed, 139 insertions(+), 41 deletions(-)
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -21,6 +21,7 @@
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
#include <linux/ima.h>
#include <asm/atomic.h>
@@ -32,7 +33,7 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+static DEFINE_PER_CPU(spinlock_t, files_cpulock);
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;
@@ -330,42 +331,101 @@ void put_filp(struct file *file)
void file_sb_list_add(struct file *file, struct super_block *sb)
{
- spin_lock(&files_lock);
+ spinlock_t *lock;
+ struct list_head *list;
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
+
+ lock = &get_cpu_var(files_cpulock);
+#ifdef CONFIG_SMP
+ cpu = smp_processor_id();
+ list = per_cpu_ptr(sb->s_files, cpu);
+ file->f_sb_list_cpu = cpu;
+#else
+ list = &sb->s_files;
+#endif
+ spin_lock(lock);
BUG_ON(!list_empty(&file->f_u.fu_list));
- list_add(&file->f_u.fu_list, &sb->s_files);
- spin_unlock(&files_lock);
+ list_add(&file->f_u.fu_list, list);
+ spin_unlock(lock);
+ put_cpu_var(files_cpulock);
}
void file_sb_list_del(struct file *file)
{
if (!list_empty(&file->f_u.fu_list)) {
- spin_lock(&files_lock);
+ spinlock_t *lock;
+
+#ifdef CONFIG_SMP
+ lock = &per_cpu(files_cpulock, file->f_sb_list_cpu);
+#else
+ lock = &__get_cpu_var(files_cpulock);
+#endif
+ spin_lock(lock);
list_del_init(&file->f_u.fu_list);
- spin_unlock(&files_lock);
+ spin_unlock(lock);
+ }
+}
+
+static void file_list_lock_all(void)
+{
+ int i;
+ int nr = 0;
+
+ for_each_possible_cpu(i) {
+ spinlock_t *lock;
+
+ lock = &per_cpu(files_cpulock, i);
+ spin_lock_nested(lock, nr);
+ nr++;
+ }
+}
+
+static void file_list_unlock_all(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ spinlock_t *lock;
+
+ lock = &per_cpu(files_cpulock, i);
+ spin_unlock(lock);
}
}
int fs_may_remount_ro(struct super_block *sb)
{
- struct file *file;
+ int i;
/* Check that no files are currently opened for writing. */
- spin_lock(&files_lock);
- list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
+ file_list_lock_all();
+ for_each_possible_cpu(i) {
+ struct file *file;
+ struct list_head *list;
+
+#ifdef CONFIG_SMP
+ list = per_cpu_ptr(sb->s_files, i);
+#else
+ list = &sb->s_files;
+#endif
+ list_for_each_entry(file, list, f_u.fu_list) {
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ /* File with pending delete? */
+ if (inode->i_nlink == 0)
+ goto too_bad;
+
+ /* Writeable file? */
+ if (S_ISREG(inode->i_mode) &&
+ (file->f_mode & FMODE_WRITE))
+ goto too_bad;
+ }
}
- spin_unlock(&files_lock);
+ file_list_unlock_all();
return 1; /* Tis' cool bro. */
too_bad:
- spin_unlock(&files_lock);
+ file_list_unlock_all();
return 0;
}
@@ -378,37 +438,48 @@ too_bad:
*/
void mark_files_ro(struct super_block *sb)
{
- struct file *f;
+ int i;
retry:
- spin_lock(&files_lock);
- list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
- struct vfsmount *mnt;
- if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
- continue;
- if (!file_count(f))
- continue;
- if (!(f->f_mode & FMODE_WRITE))
- continue;
- spin_lock(&f->f_lock);
- f->f_mode &= ~FMODE_WRITE;
- spin_unlock(&f->f_lock);
- if (file_check_writeable(f) != 0)
- continue;
- file_release_write(f);
- mnt = mntget(f->f_path.mnt);
- /* This can sleep, so we can't hold the spinlock. */
- spin_unlock(&files_lock);
- mnt_drop_write(mnt);
- mntput(mnt);
- goto retry;
+ file_list_lock_all();
+ for_each_possible_cpu(i) {
+ struct file *f;
+ struct list_head *list;
+
+#ifdef CONFIG_SMP
+ list = per_cpu_ptr(sb->s_files, i);
+#else
+ list = &sb->s_files;
+#endif
+ list_for_each_entry(f, list, f_u.fu_list) {
+ struct vfsmount *mnt;
+ if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+ continue;
+ if (!file_count(f))
+ continue;
+ if (!(f->f_mode & FMODE_WRITE))
+ continue;
+ spin_lock(&f->f_lock);
+ f->f_mode &= ~FMODE_WRITE;
+ spin_unlock(&f->f_lock);
+ if (file_check_writeable(f) != 0)
+ continue;
+ file_release_write(f);
+ mnt = mntget(f->f_path.mnt);
+ /* This can sleep, so we can't hold the spinlock. */
+ file_list_unlock_all();
+ mnt_drop_write(mnt);
+ mntput(mnt);
+ goto retry;
+ }
}
- spin_unlock(&files_lock);
+ file_list_unlock_all();
}
void __init files_init(unsigned long mempages)
{
int n;
+ int i;
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -423,5 +494,7 @@ void __init files_init(unsigned long mem
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
files_defer_init();
+ for_each_possible_cpu(i)
+ spin_lock_init(&per_cpu(files_cpulock, i));
percpu_counter_init(&nr_files, 0);
}
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c
+++ linux-2.6/fs/super.c
@@ -62,7 +62,22 @@ static struct super_block *alloc_super(s
s = NULL;
goto out;
}
+#ifdef CONFIG_SMP
+ s->s_files = alloc_percpu(struct list_head);
+ if (!s->s_files) {
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+ }
+#else
INIT_LIST_HEAD(&s->s_files);
+#endif
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -117,6 +132,9 @@ out:
*/
static inline void destroy_super(struct super_block *s)
{
+#ifdef CONFIG_SMP
+ free_percpu(s->s_files);
+#endif
security_sb_free(s);
kfree(s->s_subtype);
kfree(s->s_options);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -924,6 +924,9 @@ struct file {
#define f_vfsmnt f_path.mnt
const struct file_operations *f_op;
spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+ int f_sb_list_cpu;
+#endif
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
@@ -1340,7 +1343,11 @@ struct super_block {
struct list_head s_inodes; /* all inodes */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+ struct list_head *s_files;
+#else
struct list_head s_files;
+#endif
/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
struct list_head s_dentry_lru; /* unused dentry lru */
int s_nr_dentry_unused; /* # of dentry on lru */
next prev parent reply other threads:[~2010-03-16 9:46 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-03-16 9:44 [patch 1/2] fs: cleanup files_lock Nick Piggin
2010-03-16 9:46 ` Nick Piggin [this message]
2010-03-16 14:41 ` Andi Kleen
2010-03-17 14:16 ` Greg KH
2010-03-17 14:38 ` Nick Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100316094630.GN2869@laptop \
--to=npiggin@suse.de \
--cc=ak@linux.intel.com \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=ebiederm@xmission.com \
--cc=fmayhar@google.com \
--cc=gregkh@suse.de \
--cc=johnstul@us.ibm.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=viro@ZenIV.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).