From: Nick Piggin <npiggin@suse.de>
To: Al Viro <viro@ZenIV.linux.org.uk>,
Frank Mayhar <fmayhar@google.com>,
John Stultz <johnstul@us.ibm.com>,
Andi Kleen <ak@linux.intel.com>,
linux-fsdevel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@suse.de>,
Alan Cox <alan@lxorguk.ukuu.org.uk>,
"Eric W. Biederman" <ebiederm@xmission.com>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [patch 2/2] fs: scale files_lock
Date: Tue, 16 Mar 2010 20:46:30 +1100 [thread overview]
Message-ID: <20100316094630.GN2869@laptop> (raw)
In-Reply-To: <20100316094423.GM2869@laptop>
fs: scale files_lock
Improve scalability of files_lock by adding per-cpu, per-sb files lists,
protected with per-cpu locking, effectively turning it into a big-writer lock.
One difficulty with this approach is that a file can be removed from the list
by another CPU. We must track which per-cpu list the file is on. Scalability
could suffer if files are frequently removed from different cpu's list.
However loads with frequent removal of files imply short interval between
adding and removing the files, and the scheduler attempts to avoid moving
processes too far away. Also, even in the case of cross-CPU removal, the
hardware has much more opportunity to parallelise cacheline transfers with N
cachelines than with 1.
A worst-case test, in which 1 CPU allocates files that are subsequently freed by
N CPUs, degenerates to contending on a single lock, which is no worse than before.
When more than one CPU is allocating files, even if they are always freed by
different CPUs, there will be more parallelism than in the single-lock case.
Testing on a 2 socket, 8 core Opteron: I measure the number of times
the lock is taken to remove a file, the number of times it is removed by the
same CPU that added it, and the number of times it is removed by the same node
that added it.
Booting:
locks=25049 cpu-hits=23174 (92.5%) node-hits=23945 (95.6%)
kbuild -j16
locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%)
dbench 64
locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%)
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
fs/file_table.c | 155 ++++++++++++++++++++++++++++++++++++++---------------
fs/super.c | 18 ++++++
include/linux/fs.h | 7 ++
3 files changed, 139 insertions(+), 41 deletions(-)
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -21,6 +21,7 @@
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
#include <linux/ima.h>
#include <asm/atomic.h>
@@ -32,7 +33,7 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+static DEFINE_PER_CPU(spinlock_t, files_cpulock);
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;
@@ -330,42 +331,101 @@ void put_filp(struct file *file)
void file_sb_list_add(struct file *file, struct super_block *sb)
{
- spin_lock(&files_lock);
+ spinlock_t *lock;
+ struct list_head *list;
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
+
+ lock = &get_cpu_var(files_cpulock);
+#ifdef CONFIG_SMP
+ cpu = smp_processor_id();
+ list = per_cpu_ptr(sb->s_files, cpu);
+ file->f_sb_list_cpu = cpu;
+#else
+ list = &sb->s_files;
+#endif
+ spin_lock(lock);
BUG_ON(!list_empty(&file->f_u.fu_list));
- list_add(&file->f_u.fu_list, &sb->s_files);
- spin_unlock(&files_lock);
+ list_add(&file->f_u.fu_list, list);
+ spin_unlock(lock);
+ put_cpu_var(files_cpulock);
}
void file_sb_list_del(struct file *file)
{
if (!list_empty(&file->f_u.fu_list)) {
- spin_lock(&files_lock);
+ spinlock_t *lock;
+
+#ifdef CONFIG_SMP
+ lock = &per_cpu(files_cpulock, file->f_sb_list_cpu);
+#else
+ lock = &__get_cpu_var(files_cpulock);
+#endif
+ spin_lock(lock);
list_del_init(&file->f_u.fu_list);
- spin_unlock(&files_lock);
+ spin_unlock(lock);
+ }
+}
+
+static void file_list_lock_all(void)
+{
+ int i;
+ int nr = 0;
+
+ for_each_possible_cpu(i) {
+ spinlock_t *lock;
+
+ lock = &per_cpu(files_cpulock, i);
+ spin_lock_nested(lock, nr);
+ nr++;
+ }
+}
+
+static void file_list_unlock_all(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ spinlock_t *lock;
+
+ lock = &per_cpu(files_cpulock, i);
+ spin_unlock(lock);
}
}
int fs_may_remount_ro(struct super_block *sb)
{
- struct file *file;
+ int i;
/* Check that no files are currently opened for writing. */
- spin_lock(&files_lock);
- list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
+ file_list_lock_all();
+ for_each_possible_cpu(i) {
+ struct file *file;
+ struct list_head *list;
+
+#ifdef CONFIG_SMP
+ list = per_cpu_ptr(sb->s_files, i);
+#else
+ list = &sb->s_files;
+#endif
+ list_for_each_entry(file, list, f_u.fu_list) {
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ /* File with pending delete? */
+ if (inode->i_nlink == 0)
+ goto too_bad;
+
+ /* Writeable file? */
+ if (S_ISREG(inode->i_mode) &&
+ (file->f_mode & FMODE_WRITE))
+ goto too_bad;
+ }
}
- spin_unlock(&files_lock);
+ file_list_unlock_all();
return 1; /* Tis' cool bro. */
too_bad:
- spin_unlock(&files_lock);
+ file_list_unlock_all();
return 0;
}
@@ -378,37 +438,48 @@ too_bad:
*/
void mark_files_ro(struct super_block *sb)
{
- struct file *f;
+ int i;
retry:
- spin_lock(&files_lock);
- list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
- struct vfsmount *mnt;
- if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
- continue;
- if (!file_count(f))
- continue;
- if (!(f->f_mode & FMODE_WRITE))
- continue;
- spin_lock(&f->f_lock);
- f->f_mode &= ~FMODE_WRITE;
- spin_unlock(&f->f_lock);
- if (file_check_writeable(f) != 0)
- continue;
- file_release_write(f);
- mnt = mntget(f->f_path.mnt);
- /* This can sleep, so we can't hold the spinlock. */
- spin_unlock(&files_lock);
- mnt_drop_write(mnt);
- mntput(mnt);
- goto retry;
+ file_list_lock_all();
+ for_each_possible_cpu(i) {
+ struct file *f;
+ struct list_head *list;
+
+#ifdef CONFIG_SMP
+ list = per_cpu_ptr(sb->s_files, i);
+#else
+ list = &sb->s_files;
+#endif
+ list_for_each_entry(f, list, f_u.fu_list) {
+ struct vfsmount *mnt;
+ if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+ continue;
+ if (!file_count(f))
+ continue;
+ if (!(f->f_mode & FMODE_WRITE))
+ continue;
+ spin_lock(&f->f_lock);
+ f->f_mode &= ~FMODE_WRITE;
+ spin_unlock(&f->f_lock);
+ if (file_check_writeable(f) != 0)
+ continue;
+ file_release_write(f);
+ mnt = mntget(f->f_path.mnt);
+ /* This can sleep, so we can't hold the spinlock. */
+ file_list_unlock_all();
+ mnt_drop_write(mnt);
+ mntput(mnt);
+ goto retry;
+ }
}
- spin_unlock(&files_lock);
+ file_list_unlock_all();
}
void __init files_init(unsigned long mempages)
{
int n;
+ int i;
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -423,5 +494,7 @@ void __init files_init(unsigned long mem
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
files_defer_init();
+ for_each_possible_cpu(i)
+ spin_lock_init(&per_cpu(files_cpulock, i));
percpu_counter_init(&nr_files, 0);
}
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c
+++ linux-2.6/fs/super.c
@@ -62,7 +62,22 @@ static struct super_block *alloc_super(s
s = NULL;
goto out;
}
+#ifdef CONFIG_SMP
+ s->s_files = alloc_percpu(struct list_head);
+ if (!s->s_files) {
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+ }
+#else
INIT_LIST_HEAD(&s->s_files);
+#endif
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -117,6 +132,9 @@ out:
*/
static inline void destroy_super(struct super_block *s)
{
+#ifdef CONFIG_SMP
+ free_percpu(s->s_files);
+#endif
security_sb_free(s);
kfree(s->s_subtype);
kfree(s->s_options);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -924,6 +924,9 @@ struct file {
#define f_vfsmnt f_path.mnt
const struct file_operations *f_op;
spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+ int f_sb_list_cpu;
+#endif
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
@@ -1340,7 +1343,11 @@ struct super_block {
struct list_head s_inodes; /* all inodes */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+ struct list_head *s_files;
+#else
struct list_head s_files;
+#endif
/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
struct list_head s_dentry_lru; /* unused dentry lru */
int s_nr_dentry_unused; /* # of dentry on lru */
next prev parent reply other threads:[~2010-03-16 9:46 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-03-16 9:44 [patch 1/2] fs: cleanup files_lock Nick Piggin
2010-03-16 9:46 ` Nick Piggin [this message]
2010-03-16 14:41 ` Andi Kleen
2010-03-17 14:16 ` Greg KH
2010-03-17 14:38 ` Nick Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100316094630.GN2869@laptop \
--to=npiggin@suse.de \
--cc=ak@linux.intel.com \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=ebiederm@xmission.com \
--cc=fmayhar@google.com \
--cc=gregkh@suse.de \
--cc=johnstul@us.ibm.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=viro@ZenIV.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).