All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Andrew Morton <akpm@osdl.org>, linux-kernel@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 4/7] fs: break the file_list_lock for sb->s_files
Date: Sun, 28 Jan 2007 12:51:22 +0100	[thread overview]
Message-ID: <20070128120510.090289000@programming.kicks-ass.net> (raw)
In-Reply-To: 20070128115118.837777000@programming.kicks-ass.net

[-- Attachment #1: s_files.patch --]
[-- Type: text/plain, Size: 13785 bytes --]

Break the protection of sb->s_files out from under the global file_list_lock.

sb->s_files is converted to a lock_list. furthermore to prevent the 
lock_list_head of getting too contended with concurrent add operations
the add is buffered in per cpu filevecs.

This would ordinarily require a flush before a delete operation - to ensure
the to be deleted entry is indeed added to the list. This is avoided by 
storing a pointer to the filevec location in the not yet used list_head.
This pointer can then be used to clear the filevec entry before its actually
added.

The file_flag mess is a bit unfortunate - this can be removed by also
converting tty->tty_files to a lock_list (TODO).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/dquot.c                   |   10 +-
 fs/file_table.c              |  170 +++++++++++++++++++++++++++++++++++++++----
 fs/open.c                    |    2 
 fs/proc/generic.c            |    8 --
 fs/super.c                   |    7 -
 include/linux/fs.h           |   23 +++++
 mm/readahead.c               |    2 
 security/selinux/selinuxfs.c |    9 +-
 8 files changed, 194 insertions(+), 37 deletions(-)

Index: linux-2.6/fs/dquot.c
===================================================================
--- linux-2.6.orig/fs/dquot.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/dquot.c	2007-01-27 21:07:44.000000000 +0100
@@ -688,23 +688,21 @@ static int dqinit_needed(struct inode *i
 /* This routine is guarded by dqonoff_mutex mutex */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
-	struct list_head *p;
+	struct file *filp;
 
 restart:
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = filp->f_path.dentry->d_inode;
 		if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
 			struct dentry *dentry = dget(filp->f_path.dentry);
-			file_list_unlock();
+			lock_list_for_each_entry_stop(filp, f_u.fu_llist);
 			sb->dq_op->initialize(inode, type);
 			dput(dentry);
 			/* As we may have blocked we had better restart... */
 			goto restart;
 		}
 	}
-	file_list_unlock();
 }
 
 /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/file_table.c	2007-01-27 21:07:44.000000000 +0100
@@ -113,7 +113,7 @@ struct file *get_empty_filp(void)
 		goto fail_sec;
 
 	tsk = current;
-	INIT_LIST_HEAD(&f->f_u.fu_list);
+	INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist);
 	atomic_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_uid = tsk->fsuid;
@@ -245,32 +245,175 @@ void put_filp(struct file *file)
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+enum {
+	FILEVEC_SIZE = 15
+};
+
+struct filevec {
+	unsigned long nr;
+	struct file *files[FILEVEC_SIZE];
+};
+
+static DEFINE_PER_CPU(struct filevec, sb_fvec);
+
+static inline unsigned int filevec_size(struct filevec *fvec)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+	return FILEVEC_SIZE - fvec->nr;
+}
+
+static inline unsigned int filevec_count(struct filevec *fvec)
+{
+	return fvec->nr;
+}
+
+static inline void filevec_reinit(struct filevec *fvec)
+{
+	fvec->nr = 0;
+}
+
+static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp)
+{
+	rcu_assign_pointer(fvec->files[fvec->nr], filp);
+
+	/*
+	 * Here we do icky stuff in order to avoid flushing the per cpu filevec
+	 * on list removal.
+	 *
+	 * We store the location on the per cpu filevec in the as of yet unused
+	 * fu_llist.next field and toggle bit 0 to indicate we done so. This
+	 * allows the removal code to set the filevec entry to NULL, thereby
+	 * avoiding the list add.
+	 *
+	 * Abuse the fu_llist.lock for protection.
+	 */
+	spin_lock(&filp->f_u.fu_llist.lock);
+	filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr];
+	__set_bit(0, (void *)&filp->f_u.fu_llist.next);
+	spin_unlock(&filp->f_u.fu_llist.lock);
+
+	fvec->nr++;
+	return filevec_size(fvec);
+}
+
+static void __filevec_add(struct filevec *fvec)
+{
+	int i;
+
+	for (i = 0; i < filevec_count(fvec); i++) {
+		struct file *filp;
+
+		/*
+		 * see the comment in filevec_add();
+		 * need RCU because a concurrent remove might have deleted
+		 * the entry from under us.
+		 */
+		rcu_read_lock();
+		filp = rcu_dereference(fvec->files[i]);
+		/*
+		 * the simple case, its gone - NEXT!
+		 */
+		if (!filp) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		spin_lock(&filp->f_u.fu_llist.lock);
+		/*
+		 * If the entry really is still there, add it!
+		 */
+		if (rcu_dereference(fvec->files[i])) {
+			struct super_block *sb =
+				filp->f_mapping->host->i_sb;
+
+			__lock_list_add(&filp->f_u.fu_llist, &sb->s_files);
+		}
+		spin_unlock(&filp->f_u.fu_llist.lock);
+		rcu_read_unlock();
+	}
+	filevec_reinit(fvec);
+}
+
+static void filevec_add_drain(void)
+{
+	struct filevec *fvec = &get_cpu_var(sb_fvec);
+	if (filevec_count(fvec))
+		__filevec_add(fvec);
+	put_cpu_var(sb_fvec);
 }
 
+static void filevec_add_drain_per_cpu(struct work_struct *dummy)
+{
+	filevec_add_drain();
+}
+
+int filevec_add_drain_all(void)
+{
+	return schedule_on_each_cpu(filevec_add_drain_per_cpu);
+}
+EXPORT_SYMBOL_GPL(filevec_add_drain_all);
+
 void file_kill(struct file *file)
 {
-	if (!list_empty(&file->f_u.fu_list)) {
+	if (file_flag(file, F_SUPERBLOCK)) {
+		void **ptr;
+
+		file_flag_clear(file, F_SUPERBLOCK);
+
+		/*
+		 * If bit 0 of the fu_llist.next pointer is set we're still
+		 * enqueued on a per cpu filevec, in that case clear the entry
+		 * and we're done.
+		 */
+		spin_lock(&file->f_u.fu_llist.lock);
+		ptr = (void **)file->f_u.fu_llist.next;
+		if (__test_and_clear_bit(0, (void *)&ptr)) {
+			rcu_assign_pointer(*ptr, NULL);
+			INIT_LIST_HEAD(&file->f_u.fu_llist.head);
+			spin_unlock(&file->f_u.fu_llist.lock);
+			return;
+		}
+		spin_unlock(&file->f_u.fu_llist.lock);
+
+		if (!list_empty(&file->f_u.fu_list))
+			lock_list_del_init(&file->f_u.fu_llist);
+
+	} else if (!list_empty(&file->f_u.fu_list)) {
 		file_list_lock();
 		list_del_init(&file->f_u.fu_list);
 		file_list_unlock();
 	}
 }
 
+void file_move(struct file *file, struct list_head *list)
+{
+	struct super_block *sb;
+
+	if (!list)
+		return;
+
+	file_kill(file);
+
+	sb = file->f_mapping->host->i_sb;
+	if (list == &sb->s_files.head) {
+		struct filevec *fvec = &get_cpu_var(sb_fvec);
+		file_flag_set(file, F_SUPERBLOCK);
+		if (!filevec_add(fvec, file))
+			__filevec_add(fvec);
+		put_cpu_var(sb_fvec);
+	} else {
+		file_list_lock();
+		list_add(&file->f_u.fu_list, list);
+		file_list_unlock();
+	}
+}
+
 int fs_may_remount_ro(struct super_block *sb)
 {
-	struct list_head *p;
+	struct file *file;
 
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *file = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -281,10 +424,9 @@ int fs_may_remount_ro(struct super_block
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
-	file_list_unlock();
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lock_list_for_each_entry_stop(file, f_u.fu_llist);
 	return 0;
 }
 
Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/open.c	2007-01-27 21:07:44.000000000 +0100
@@ -692,7 +692,7 @@ static struct file *__dentry_open(struct
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_move(f, &inode->i_sb->s_files.head);
 
 	if (!open && f->f_op)
 		open = f->f_op->open;
Index: linux-2.6/fs/proc/generic.c
===================================================================
--- linux-2.6.orig/fs/proc/generic.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/proc/generic.c	2007-01-27 21:07:44.000000000 +0100
@@ -549,15 +549,14 @@ static int proc_register(struct proc_dir
  */
 static void proc_kill_inodes(struct proc_dir_entry *de)
 {
-	struct list_head *p;
+	struct file *filp;
 	struct super_block *sb = proc_mnt->mnt_sb;
 
 	/*
 	 * Actually it's a partial revoke().
 	 */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 		struct inode * inode;
 		const struct file_operations *fops;
@@ -571,7 +570,6 @@ static void proc_kill_inodes(struct proc
 		filp->f_op = NULL;
 		fops_put(fops);
 	}
-	file_list_unlock();
 }
 
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/super.c	2007-01-27 21:07:44.000000000 +0100
@@ -67,7 +67,7 @@ static struct super_block *alloc_super(s
 		}
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
-		INIT_LIST_HEAD(&s->s_files);
+		INIT_LOCK_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -568,12 +568,11 @@ static void mark_files_ro(struct super_b
 {
 	struct file *f;
 
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	filevec_add_drain_all();
+	lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
 		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
-	file_list_unlock();
 }
 
 /**
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2007-01-22 22:43:28.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2007-01-27 21:07:44.000000000 +0100
@@ -275,6 +275,7 @@ extern int dir_notify_enable;
 #include <linux/cache.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/lock_list.h>
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
@@ -709,9 +710,13 @@ struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
 	 * fu_rcuhead for RCU freeing
+	 * fu_llist is used for the superblock s_files list; its crucial that
+	 * the spinlock contained therein is not clobbered by other uses of
+	 * the union.
 	 */
 	union {
 		struct list_head	fu_list;
+		struct lock_list_head	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
@@ -744,9 +749,25 @@ extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
+/*
+ * steal the upper 8 bits from the read-a-head flags
+ */
+#define F_SHIFT		24
+
+#define F_SUPERBLOCK	0
+
+#define file_flag_set(file, flag)	\
+	__set_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag_clear(file, flag)	\
+	__clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag(file, flag)		\
+	test_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+
 #define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
+extern int filevec_add_drain_all(void);
+
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
@@ -928,7 +949,7 @@ struct super_block {
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
-	struct list_head	s_files;
+	struct lock_list_head	s_files;
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c	2007-01-22 22:43:29.000000000 +0100
+++ linux-2.6/mm/readahead.c	2007-01-27 21:07:44.000000000 +0100
@@ -69,7 +69,7 @@ static inline void reset_ahead_window(st
 static inline void ra_off(struct file_ra_state *ra)
 {
 	ra->start = 0;
-	ra->flags = 0;
+	ra->flags &= (~0UL) << F_SHIFT;
 	ra->size = 0;
 	reset_ahead_window(ra);
 	return;
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c	2007-01-13 21:04:19.000000000 +0100
+++ linux-2.6/security/selinux/selinuxfs.c	2007-01-27 21:10:48.000000000 +0100
@@ -940,7 +940,8 @@ static struct file_operations sel_commit
  * fs/proc/generic.c proc_kill_inodes */
 static void sel_remove_bools(struct dentry *de)
 {
-	struct list_head *p, *node;
+	struct list_head *node;
+	struct file *filp;
 	struct super_block *sb = de->d_sb;
 
 	spin_lock(&dcache_lock);
@@ -962,9 +963,8 @@ static void sel_remove_bools(struct dent
 
 	spin_unlock(&dcache_lock);
 
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 
 		if (dentry->d_parent != de) {
@@ -972,7 +972,6 @@ static void sel_remove_bools(struct dent
 		}
 		filp->f_op = NULL;
 	}
-	file_list_unlock();
 }
 
 #define BOOL_DIR_NAME "booleans"

--


  parent reply	other threads:[~2007-01-28 12:11 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-28 11:51 [PATCH 0/7] breaking the global file_list_lock Peter Zijlstra
2007-01-28 11:51 ` [PATCH 1/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
2007-01-28 11:51 ` [PATCH 3/7] barrier: a scalable synchonisation barrier Peter Zijlstra
2007-01-28 14:39   ` Christoph Hellwig
2007-01-28 15:24     ` Ingo Molnar
2007-01-28 15:34       ` Christoph Hellwig
2007-01-31 19:12       ` Paul E. McKenney
2007-01-31 21:13         ` Oleg Nesterov
2007-01-31 21:30           ` Oleg Nesterov
2007-01-31 21:48           ` Peter Zijlstra
2007-01-31 23:32             ` Paul E. McKenney
2007-02-01  0:03               ` Peter Zijlstra
2007-02-01  0:48                 ` Paul E. McKenney
2007-02-01 16:00                   ` Oleg Nesterov
2007-02-01 21:38                     ` Paul E. McKenney
2007-02-02 11:56                       ` Oleg Nesterov
2007-02-02 12:01                         ` Peter Zijlstra
2007-02-02 17:13                         ` Paul E. McKenney
2007-02-03 16:38                   ` Oleg Nesterov
2007-02-04  0:23                     ` Paul E. McKenney
2007-02-04  3:24                       ` Alan Stern
2007-02-04  5:46                         ` Paul E. McKenney
2007-01-28 11:51 ` Peter Zijlstra [this message]
2007-01-28 14:43   ` [PATCH 4/7] fs: break the file_list_lock for sb->s_files Christoph Hellwig
2007-01-28 15:21     ` Ingo Molnar
2007-01-28 15:30       ` Christoph Hellwig
2007-01-28 15:32         ` Peter Zijlstra
2007-01-28 15:36           ` Christoph Hellwig
2007-01-28 15:44         ` Ingo Molnar
2007-01-28 16:25         ` Bill Huey
2007-01-28 11:51 ` [PATCH 5/7] fs: restore previous sb->s_files iteration semantics Peter Zijlstra
2007-01-28 11:51 ` [PATCH 6/7] schedule_on_each_cpu_wq() Peter Zijlstra
2007-01-28 11:51 ` [PATCH 7/7] fs: fixup filevec_add_drain_all Peter Zijlstra
2007-01-28 12:16 ` [PATCH 8/7] fs: free_write_pipe() fix Peter Zijlstra
2007-01-28 14:43 ` [PATCH 0/7] breaking the global file_list_lock Christoph Hellwig
2007-01-28 15:11   ` Christoph Hellwig
2007-01-28 15:29     ` Peter Zijlstra
2007-01-28 15:33       ` Christoph Hellwig
2007-01-29 13:32     ` Stephen Smalley
2007-01-29 18:02       ` Christoph Hellwig
2007-01-28 15:24   ` Ingo Molnar
2007-01-28 16:52     ` Martin J. Bligh
2007-01-28 17:04       ` lockmeter Christoph Hellwig
2007-01-28 17:38         ` lockmeter Martin J. Bligh
2007-01-28 18:01           ` lockmeter Bill Huey
2007-01-28 19:26             ` lockmeter Ingo Molnar
2007-01-28 21:17             ` lockmeter Ingo Molnar
2007-01-29  5:27               ` lockmeter Bill Huey
2007-01-29 10:26                 ` lockmeter Bill Huey
2007-01-29  1:08         ` lockmeter Arjan van de Ven
2007-01-29  1:12           ` lockmeter Martin J. Bligh
2007-01-28 18:41   ` [PATCH 0/7] breaking the global file_list_lock Ingo Molnar
2007-01-28 20:38     ` Christoph Hellwig
2007-01-28 21:05       ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070128120510.090289000@programming.kicks-ass.net \
    --to=a.p.zijlstra@chello.nl \
    --cc=akpm@osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.