All of lore.kernel.org
 help / color / mirror / Atom feed
* fs: break out inode LRU operations from node_lock
@ 2010-10-27  4:23 Dave Chinner
  2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  4:23 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

Hi Al,

The following patches break the inode LRU operations and the first half of
iput_final() out from under the inode_lock. I included the dispose_one_inode
factoring patch to isolate the inode_lock from iput_final() completely. It's
easy enough to drop if you don't want that right now.

It passes xfstests on 1-, 2- and 8-way VMs, survives 8-way parallel
create/traverse/unlink workloads with 0, 1 and 65536 byte files on XFS and
ext4, and shows no problems with looping 50-client dbench runs on XFS or ext4.

The patches should apply to your current merge-stem tree.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 1/4] fs: protect inode->i_state with inode->i_lock
  2010-10-27  4:23 fs: break out inode LRU operations from node_lock Dave Chinner
@ 2010-10-27  4:23 ` Dave Chinner
  2010-10-27  7:07   ` Christian Stroetmann
  2010-10-27  8:58   ` Christoph Hellwig
  2010-10-27  4:23 ` [PATCH 2/4] fs: factor inode disposal Dave Chinner
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  4:23 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

From: Dave Chinner <dchinner@redhat.com>

Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/drop_caches.c       |    6 +++-
 fs/fs-writeback.c      |   31 ++++++++++++++--
 fs/inode.c             |   95 +++++++++++++++++++++++++++++++++++++++---------
 fs/notify/inode_mark.c |   17 ++++++---
 fs/quota/dquot.c       |    6 +++-
 5 files changed, 127 insertions(+), 28 deletions(-)

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c21..c495fc3 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,8 +18,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
+		spin_unlock(&inode->i_lock);
 		if (inode->i_mapping->nrpages == 0)
 			continue;
 		__iget(inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f6af81a..bd9204d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -293,9 +293,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	 while (inode->i_state & I_SYNC) {
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
+		spin_lock(&inode->i_lock);
 	}
 }
 
@@ -319,6 +321,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned dirty;
 	int ret;
 
+	spin_lock(&inode->i_lock);
 	if (!atomic_read(&inode->i_count))
 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 	else
@@ -334,6 +337,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of b_io.
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			return 0;
 		}
@@ -349,6 +353,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -370,8 +375,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * write_inode()
 	 */
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
@@ -381,6 +388,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -422,6 +430,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	inode_sync_complete(inode);
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
@@ -492,10 +501,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * kind does not need peridic writeout yet, and for the latter
 		 * kind writeout is handled by the freer.
 		 */
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			continue;
 		}
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
@@ -681,7 +693,9 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
+			spin_lock(&inode->i_lock);
 			inode_wait_for_writeback(inode);
+			spin_unlock(&inode->i_lock);
 		}
 		spin_unlock(&inode_lock);
 	}
@@ -947,6 +961,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
@@ -958,7 +973,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * superblock list, based upon its state.
 		 */
 		if (inode->i_state & I_SYNC)
-			goto out;
+			goto out_unlock_inode;
 
 		/*
 		 * Only add valid (hashed) inodes to the superblock's
@@ -966,11 +981,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
 			if (inode_unhashed(inode))
-				goto out;
+				goto out_unlock_inode;
 		}
 		if (inode->i_state & I_FREEING)
-			goto out;
+			goto out_unlock_inode;
 
+		spin_unlock(&inode->i_lock);
 		/*
 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
 		 * reposition it (that would break b_dirty time-ordering).
@@ -995,7 +1011,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
+		goto out;
 	}
+out_unlock_inode:
+	spin_unlock(&inode->i_lock);
 out:
 	spin_unlock(&inode_lock);
 
@@ -1043,8 +1062,12 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
+		spin_unlock(&inode->i_lock);
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
 			continue;
diff --git a/fs/inode.c b/fs/inode.c
index a6d6068..eaba6ce 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,6 +26,17 @@
 #include <linux/posix_acl.h>
 
 /*
+ * inode locking rules.
+ *
+ * inode->i_lock protects:
+ *   i_state
+ *
+ * Lock ordering:
+ * inode_lock
+ *   inode->i_lock
+ */
+
+/*
  * This is needed for the following functions:
  *  - inode_has_buffers
  *  - invalidate_bdev
@@ -429,7 +440,9 @@ void end_writeback(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
+	spin_lock(&inode->i_lock);
 	inode->i_state = I_FREEING | I_CLEAR;
+	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(end_writeback);
 
@@ -498,12 +511,17 @@ void evict_inodes(struct super_block *sb)
 		if (atomic_read(&inode->i_count))
 			continue;
 
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			WARN_ON(1);
 			continue;
 		}
 
 		inode->i_state |= I_FREEING;
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -511,8 +529,6 @@ void evict_inodes(struct super_block *sb)
 		 */
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -521,7 +537,7 @@ void evict_inodes(struct super_block *sb)
 }
 
 /**
- * invalidate_inodes	- attempt to free all inodes on a superblock
+ * nvalidate_inodes	- attempt to free all inodes on a superblock
  * @sb:		superblock to operate on
  *
  * Attempts to free all inodes for a given superblock.  If there were any
@@ -537,14 +553,21 @@ int invalidate_inodes(struct super_block *sb)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		if (atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
 		}
 
 		inode->i_state |= I_FREEING;
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -552,8 +575,6 @@ int invalidate_inodes(struct super_block *sb)
 		 */
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -612,8 +633,10 @@ static void prune_icache(int nr_to_scan)
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
 		 */
+		spin_lock(&inode->i_lock);
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
+			spin_unlock(&inode->i_lock);
 			list_del_init(&inode->i_lru);
 			percpu_counter_dec(&nr_inodes_unused);
 			continue;
@@ -621,12 +644,14 @@ static void prune_icache(int nr_to_scan)
 
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
-			list_move(&inode->i_lru, &inode_lru);
 			inode->i_state &= ~I_REFERENCED;
+			spin_unlock(&inode->i_lock);
+			list_move(&inode->i_lru, &inode_lru);
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
@@ -637,11 +662,15 @@ static void prune_icache(int nr_to_scan)
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
-			if (!can_unuse(inode))
+			spin_lock(&inode->i_lock);
+			if (!can_unuse(inode)) {
+				spin_unlock(&inode->i_lock);
 				continue;
+			}
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -708,10 +737,12 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
+		spin_unlock(&inode->i_lock);
 		__iget(inode);
 		return inode;
 	}
@@ -734,10 +765,12 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
+		spin_unlock(&inode->i_lock);
 		__iget(inode);
 		return inode;
 	}
@@ -803,8 +836,10 @@ struct inode *new_inode(struct super_block *sb)
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
-		__inode_sb_list_add(inode);
+		spin_lock(&inode->i_lock);
 		inode->i_state = 0;
+		spin_unlock(&inode->i_lock);
+		__inode_sb_list_add(inode);
 		spin_unlock(&inode_lock);
 	}
 	return inode;
@@ -871,9 +906,11 @@ static struct inode *get_new_inode(struct super_block *sb,
 			if (set(inode, data))
 				goto set_failed;
 
+			spin_lock(&inode->i_lock);
+			inode->i_state = I_NEW;
+			spin_unlock(&inode->i_lock);
 			hlist_add_head(&inode->i_hash, head);
 			__inode_sb_list_add(inode);
-			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
@@ -917,10 +954,12 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		/* We released the lock, so.. */
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
+			spin_lock(&inode->i_lock);
 			inode->i_ino = ino;
+			inode->i_state = I_NEW;
+			spin_unlock(&inode->i_lock);
 			hlist_add_head(&inode->i_hash, head);
 			__inode_sb_list_add(inode);
-			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
@@ -1005,15 +1044,19 @@ EXPORT_SYMBOL(iunique);
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
 		__iget(inode);
-	else
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
 		/*
 		 * Handle the case where s_op->clear_inode is not been
 		 * called yet, and somebody is calling igrab
 		 * while the inode is getting freed.
 		 */
 		inode = NULL;
+	}
 	spin_unlock(&inode_lock);
 	return inode;
 }
@@ -1242,7 +1285,9 @@ int insert_inode_locked(struct inode *inode)
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
+	spin_lock(&inode->i_lock);
 	inode->i_state |= I_NEW;
+	spin_unlock(&inode->i_lock);
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
@@ -1252,8 +1297,11 @@ int insert_inode_locked(struct inode *inode)
 				continue;
 			if (old->i_sb != sb)
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
@@ -1261,6 +1309,7 @@ int insert_inode_locked(struct inode *inode)
 			spin_unlock(&inode_lock);
 			return 0;
 		}
+		spin_unlock(&old->i_lock);
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
@@ -1279,7 +1328,9 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
+	spin_lock(&inode->i_lock);
 	inode->i_state |= I_NEW;
+	spin_unlock(&inode->i_lock);
 
 	while (1) {
 		struct hlist_node *node;
@@ -1291,8 +1342,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 				continue;
 			if (!test(old, data))
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
@@ -1300,6 +1354,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 			spin_unlock(&inode_lock);
 			return 0;
 		}
+		spin_unlock(&old->i_lock);
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
@@ -1346,6 +1401,9 @@ static void iput_final(struct inode *inode)
 	const struct super_operations *op = inode->i_sb->s_op;
 	int drop;
 
+	spin_lock(&inode->i_lock);
+	WARN_ON(inode->i_state & I_NEW);
+
 	if (op && op->drop_inode)
 		drop = op->drop_inode(inode);
 	else
@@ -1357,21 +1415,23 @@ static void iput_final(struct inode *inode)
 			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
 				inode_lru_list_add(inode);
 			}
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			return;
 		}
-		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
 		spin_lock(&inode_lock);
+		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
 		__remove_inode_hash(inode);
 	}
 
-	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+	spin_unlock(&inode->i_lock);
 
 	/*
 	 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -1592,6 +1652,7 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 	schedule();
 	finish_wait(wq, &wait.wait);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 21ed106..08f0d16 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -249,8 +249,12 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * If i_count is zero, the inode cannot have any watches and
@@ -272,10 +276,13 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
+		    atomic_read(&next_i->i_count)) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+				__iget(next_i);
+				need_iput = next_i;
+			}
+			spin_unlock(&next_i->i_lock);
 		}
 
 		/*
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316..fb7c2c0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -898,8 +898,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
+		spin_unlock(&inode->i_lock);
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/4] fs: factor inode disposal
  2010-10-27  4:23 fs: break out inode LRU operations from node_lock Dave Chinner
  2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
@ 2010-10-27  4:23 ` Dave Chinner
  2010-10-27  7:55   ` Christoph Hellwig
  2010-10-27  9:06   ` Christoph Hellwig
  2010-10-27  4:23 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner
  2010-10-27  4:23 ` [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache Dave Chinner
  3 siblings, 2 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  4:23 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

From: Dave Chinner <dchinner@redhat.com>

We have a couple of places that dispose of inodes. factor the
disposal into a common helper dispose_one_inode() to isolate this
code and make it simpler to peel away the inode_lock from the code.

While doing this, change the logic flow in iput_final() to separate
the different cases that need to be handled to make the transitions
the inode goes through more obvious.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c |   75 +++++++++++++++++++++++++++++++----------------------------
 1 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index eaba6ce..f134aa4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -464,6 +464,32 @@ static void evict(struct inode *inode)
 }
 
 /*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to but avoiding unnecessary lock round-trips for the lists it is no longer
+ * on.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode_inode). The caller is responsisble for setting this.
+ */
+static void dispose_one_inode(struct inode *inode)
+{
+	BUG_ON(!(inode->i_state & I_FREEING));
+
+	spin_lock(&inode_lock);
+	list_del_init(&inode->i_wb_list);
+	__remove_inode_hash(inode);
+	__inode_sb_list_del(inode);
+	spin_unlock(&inode_lock);
+
+	evict(inode);
+
+	wake_up_inode(inode);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	destroy_inode(inode);
+}
+
+/*
  * dispose_list - dispose of the contents of a local list
  * @head: the head of the list to free
  *
@@ -478,15 +504,7 @@ static void dispose_list(struct list_head *head)
 		inode = list_first_entry(head, struct inode, i_lru);
 		list_del_init(&inode->i_lru);
 
-		evict(inode);
-
-		spin_lock(&inode_lock);
-		__remove_inode_hash(inode);
-		__inode_sb_list_del(inode);
-		spin_unlock(&inode_lock);
-
-		wake_up_inode(inode);
-		destroy_inode(inode);
+		dispose_one_inode(inode);
 	}
 }
 
@@ -528,7 +546,6 @@ void evict_inodes(struct super_block *sb)
 		 * set so that it won't get moved back on there if it is dirty.
 		 */
 		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
 	}
 	spin_unlock(&inode_lock);
 
@@ -574,7 +591,6 @@ int invalidate_inodes(struct super_block *sb)
 		 * set so that it won't get moved back on there if it is dirty.
 		 */
 		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
 	}
 	spin_unlock(&inode_lock);
 
@@ -677,7 +693,6 @@ static void prune_icache(int nr_to_scan)
 		 * set so that it won't get moved back on there if it is dirty.
 		 */
 		list_move(&inode->i_lru, &freeable);
-		list_del_init(&inode->i_wb_list);
 		percpu_counter_dec(&nr_inodes_unused);
 	}
 	if (current_is_kswapd())
@@ -1409,16 +1424,16 @@ static void iput_final(struct inode *inode)
 	else
 		drop = generic_drop_inode(inode);
 
+	if (!drop && (sb->s_flags & MS_ACTIVE)) {
+		inode->i_state |= I_REFERENCED;
+		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+			inode_lru_list_add(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_lock);
+		return;
+	}
+
 	if (!drop) {
-		if (sb->s_flags & MS_ACTIVE) {
-			inode->i_state |= I_REFERENCED;
-			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
-				inode_lru_list_add(inode);
-			}
-			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_lock);
-			return;
-		}
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
@@ -1427,26 +1442,14 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		__remove_inode_hash(inode);
 	}
 
 	inode->i_state |= I_FREEING;
-	spin_unlock(&inode->i_lock);
-
-	/*
-	 * Move the inode off the IO lists and LRU once I_FREEING is
-	 * set so that it won't get moved back on there if it is dirty.
-	 */
 	inode_lru_list_del(inode);
-	list_del_init(&inode->i_wb_list);
-
-	__inode_sb_list_del(inode);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
-	evict(inode);
-	remove_inode_hash(inode);
-	wake_up_inode(inode);
-	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
-	destroy_inode(inode);
+
+	dispose_one_inode(inode);
 }
 
 /**
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27  4:23 fs: break out inode LRU operations from node_lock Dave Chinner
  2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
  2010-10-27  4:23 ` [PATCH 2/4] fs: factor inode disposal Dave Chinner
@ 2010-10-27  4:23 ` Dave Chinner
  2010-10-27  7:08   ` Christian Stroetmann
  2010-10-27  9:05   ` Christoph Hellwig
  2010-10-27  4:23 ` [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache Dave Chinner
  3 siblings, 2 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  4:23 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

From: Dave Chinner <dchinner@redhat.com>

Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock to allow the inode to be
added to the LRU list in iput_final without needing to deal with
lock inversions. This keeps iput_final() clean and neat.

Further, where marking the inode I_FREEING and removing it from the
LRU, move the LRU list manipulation within the inode->i_lock to keep
the list manipulation consistent with iput_final. This also means
that most of the open coded LRU list removal + unused inode
accounting can now use the inode_lru_list_del() wrappers which
cleans the code up further.

However, this locking change means what the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c |   45 +++++++++++++++++++++++++++++----------------
 1 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index f134aa4..e04371e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -30,10 +30,13 @@
  *
  * inode->i_lock protects:
  *   i_state
+ * inode_lru_lock protects:
+ *   inode_lru, i_lru
  *
  * Lock ordering:
  * inode_lock
  *   inode->i_lock
+ *     inode_lru_lock
  */
 
 /*
@@ -83,6 +86,7 @@ static unsigned int i_hash_shift __read_mostly;
  */
 
 static LIST_HEAD(inode_lru);
+static DEFINE_SPINLOCK(inode_lru_lock);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -344,16 +348,20 @@ EXPORT_SYMBOL(ihold);
 static void inode_lru_list_add(struct inode *inode)
 {
 	if (list_empty(&inode->i_lru)) {
+		spin_lock(&inode_lru_lock);
 		list_add(&inode->i_lru, &inode_lru);
 		percpu_counter_inc(&nr_inodes_unused);
+		spin_unlock(&inode_lru_lock);
 	}
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
 	if (!list_empty(&inode->i_lru)) {
+		spin_lock(&inode_lru_lock);
 		list_del_init(&inode->i_lru);
 		percpu_counter_dec(&nr_inodes_unused);
+		spin_unlock(&inode_lru_lock);
 	}
 }
 
@@ -537,15 +545,10 @@ void evict_inodes(struct super_block *sb)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
 
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -582,15 +585,10 @@ int invalidate_inodes(struct super_block *sb)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
 
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -637,6 +635,7 @@ static void prune_icache(int nr_to_scan)
 
 	down_read(&iprune_sem);
 	spin_lock(&inode_lock);
+	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
@@ -646,10 +645,19 @@ static void prune_icache(int nr_to_scan)
 		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
 		/*
+		 * we are inverting the inode_lru_lock/inode->i_lock here,
+		 * so use a trylock. If we fail to get the lock, just move the
+		 * inode to the back of the list so we don't spin on it.
+		 */
+		if (!spin_trylock(&inode->i_lock)) {
+			list_move(&inode->i_lru, &inode_lru);
+			continue;
+		}
+
+		/*
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
 		 */
-		spin_lock(&inode->i_lock);
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			spin_unlock(&inode->i_lock);
@@ -668,17 +676,21 @@ static void prune_icache(int nr_to_scan)
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_lru_lock);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
 			spin_lock(&inode_lock);
+			spin_lock(&inode_lru_lock);
 
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
-			spin_lock(&inode->i_lock);
+			/* avoid lock inversions with trylock */
+			if (!spin_trylock(&inode->i_lock))
+				continue;
 			if (!can_unuse(inode)) {
 				spin_unlock(&inode->i_lock);
 				continue;
@@ -699,6 +711,7 @@ static void prune_icache(int nr_to_scan)
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
 		__count_vm_events(PGINODESTEAL, reap);
+	spin_unlock(&inode_lru_lock);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  4:23 fs: break out inode LRU operations from node_lock Dave Chinner
                   ` (2 preceding siblings ...)
  2010-10-27  4:23 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner
@ 2010-10-27  4:23 ` Dave Chinner
  2010-10-27  4:40   ` Al Viro
  3 siblings, 1 reply; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  4:23 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

From: Dave Chinner <dchinner@redhat.com>

Now that inode state changes are protected by the inode->i_lock and
the inode LRU manipulations by the inode_lru_lock, we can remove the
inode_lock from prune_icache and the initial part of iput_final().

instead of using the inode_lock to protect the inode during
iput_final, use the inode->i_lock instead. This protects the inode
against new references being taken while we change the inode state
to I_FREEING, as well as preventing prune_icache from grabbing the
inode while we are manipulating it. Hence we no longer need the
iṉode_lock in iput_final prior to setting I_FREEING on the inode.

For prune_icache, we no longer need the inode_lock to protect the
LRU list, and the inodes themselves are protected against freeing
races by the inode->i_lock. Hence we can lift the inode_lock from
prune_icache as well.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c |   19 +++++--------------
 1 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index e04371e..b5f1585 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -634,7 +634,6 @@ static void prune_icache(int nr_to_scan)
 	unsigned long reap = 0;
 
 	down_read(&iprune_sem);
-	spin_lock(&inode_lock);
 	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
@@ -660,8 +659,8 @@ static void prune_icache(int nr_to_scan)
 		 */
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
-			spin_unlock(&inode->i_lock);
 			list_del_init(&inode->i_lru);
+			spin_unlock(&inode->i_lock);
 			percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
@@ -669,20 +668,18 @@ static void prune_icache(int nr_to_scan)
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
 			inode->i_state &= ~I_REFERENCED;
-			spin_unlock(&inode->i_lock);
 			list_move(&inode->i_lru, &inode_lru);
+			spin_unlock(&inode->i_lock);
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lru_lock);
-			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
-			spin_lock(&inode_lock);
 			spin_lock(&inode_lru_lock);
 
 			if (inode != list_entry(inode_lru.next,
@@ -701,8 +698,8 @@ static void prune_icache(int nr_to_scan)
 		spin_unlock(&inode->i_lock);
 
 		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
+		 * Move the inode off the LRU once I_FREEING is set so that it
+		 * won't get moved back on there if it is dirty.
 		 */
 		list_move(&inode->i_lru, &freeable);
 		percpu_counter_dec(&nr_inodes_unused);
@@ -712,7 +709,6 @@ static void prune_icache(int nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&inode_lru_lock);
-	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
 	up_read(&iprune_sem);
@@ -1429,7 +1425,6 @@ static void iput_final(struct inode *inode)
 	const struct super_operations *op = inode->i_sb->s_op;
 	int drop;
 
-	spin_lock(&inode->i_lock);
 	WARN_ON(inode->i_state & I_NEW);
 
 	if (op && op->drop_inode)
@@ -1442,16 +1437,13 @@ static void iput_final(struct inode *inode)
 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 			inode_lru_list_add(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
 		return;
 	}
 
 	if (!drop) {
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
 		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
@@ -1460,7 +1452,6 @@ static void iput_final(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
 
 	dispose_one_inode(inode);
 }
@@ -1479,7 +1470,7 @@ void iput(struct inode *inode)
 	if (inode) {
 		BUG_ON(inode->i_state & I_CLEAR);
 
-		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
 			iput_final(inode);
 	}
 }
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  4:23 ` [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache Dave Chinner
@ 2010-10-27  4:40   ` Al Viro
  2010-10-27  4:47       ` Eric Dumazet
  2010-10-27  9:12       ` Dave Chinner
  0 siblings, 2 replies; 26+ messages in thread
From: Al Viro @ 2010-10-27  4:40 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that inode state changes are protected by the inode->i_lock and
> the inode LRU manipulations by the inode_lru_lock, we can remove the
> inode_lock from prune_icache and the initial part of iput_final().
> 
> instead of using the inode_lock to protect the inode during
> iput_final, use the inode->i_lock instead. This protects the inode
> against new references being taken while we change the inode state
> to I_FREEING, as well as preventing prune_icache from grabbing the
> inode while we are manipulating it. Hence we no longer need the
> i???ode_lock in iput_final prior to setting I_FREEING on the inode.
  ^^^^^^^^^^^^

... the hell?  There's more such damage elsewhere in the thread; what's
going on?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  4:40   ` Al Viro
@ 2010-10-27  4:47       ` Eric Dumazet
  2010-10-27  9:12       ` Dave Chinner
  1 sibling, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2010-10-27  4:47 UTC (permalink / raw)
  To: Al Viro; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

Le mercredi 27 octobre 2010 à 05:40 +0100, Al Viro a écrit :
> On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that inode state changes are protected by the inode->i_lock and
> > the inode LRU manipulations by the inode_lru_lock, we can remove the
> > inode_lock from prune_icache and the initial part of iput_final().
> > 
> > instead of using the inode_lock to protect the inode during
> > iput_final, use the inode->i_lock instead. This protects the inode
> > against new references being taken while we change the inode state
> > to I_FREEING, as well as preventing prune_icache from grabbing the
> > inode while we are manipulating it. Hence we no longer need the
> > i???ode_lock in iput_final prior to setting I_FREEING on the inode.
>   ^^^^^^^^^^^^
> 
> ... the hell?  There's more such damage elsewhere in the thread; what's
> going on?
> --

Maybe its on your side, no problem here on my copy.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
@ 2010-10-27  4:47       ` Eric Dumazet
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2010-10-27  4:47 UTC (permalink / raw)
  To: Al Viro; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

Le mercredi 27 octobre 2010 à 05:40 +0100, Al Viro a écrit :
> On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that inode state changes are protected by the inode->i_lock and
> > the inode LRU manipulations by the inode_lru_lock, we can remove the
> > inode_lock from prune_icache and the initial part of iput_final().
> > 
> > instead of using the inode_lock to protect the inode during
> > iput_final, use the inode->i_lock instead. This protects the inode
> > against new references being taken while we change the inode state
> > to I_FREEING, as well as preventing prune_icache from grabbing the
> > inode while we are manipulating it. Hence we no longer need the
> > i???ode_lock in iput_final prior to setting I_FREEING on the inode.
>   ^^^^^^^^^^^^
> 
> ... the hell?  There's more such damage elsewhere in the thread; what's
> going on?
> --

Maybe its on your side, no problem here on my copy.


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  4:47       ` Eric Dumazet
  (?)
@ 2010-10-27  5:25       ` Al Viro
  2010-10-27  5:50           ` Eric Dumazet
  -1 siblings, 1 reply; 26+ messages in thread
From: Al Viro @ 2010-10-27  5:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 06:47:46AM +0200, Eric Dumazet wrote:
> Le mercredi 27 octobre 2010 ?? 05:40 +0100, Al Viro a ??crit :
> > On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Now that inode state changes are protected by the inode->i_lock and
> > > the inode LRU manipulations by the inode_lru_lock, we can remove the
> > > inode_lock from prune_icache and the initial part of iput_final().
> > > 
> > > instead of using the inode_lock to protect the inode during
> > > iput_final, use the inode->i_lock instead. This protects the inode
> > > against new references being taken while we change the inode state
> > > to I_FREEING, as well as preventing prune_icache from grabbing the
> > > inode while we are manipulating it. Hence we no longer need the
> > > i???ode_lock in iput_final prior to setting I_FREEING on the inode.
> >   ^^^^^^^^^^^^
> > 
> > ... the hell?  There's more such damage elsewhere in the thread; what's
> > going on?
> > --
> 
> Maybe its on your side, no problem here on my copy.

"i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
n with line below".  I doubt that it's MTA braindamage.

In the first patch there's 

- * invalidate_inodes    - attempt to free all inodes on a
+ * nvalidate_inodes    - attempt to free all inodes on a

and I _really_ doubt that anything in mail system is capable of something
that elaborate.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  5:25       ` Al Viro
@ 2010-10-27  5:50           ` Eric Dumazet
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2010-10-27  5:50 UTC (permalink / raw)
  To: Al Viro; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

Le mercredi 27 octobre 2010 à 06:25 +0100, Al Viro a écrit :
> "i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
> n with line below".  I doubt that it's MTA braindamage.
> 
> In the first patch there's 
> 
> - * invalidate_inodes    - attempt to free all inodes on a
> + * nvalidate_inodes    - attempt to free all inodes on a
> 
> and I _really_ doubt that anything in mail system is capable of something
> that elaborate.

Again, I can not see it in my copy, I checked lkml archives too :

http://lkml.org/lkml/2010/10/27/7

Mail was fine, maybe your file system is corrupted ?




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
@ 2010-10-27  5:50           ` Eric Dumazet
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2010-10-27  5:50 UTC (permalink / raw)
  To: Al Viro; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

Le mercredi 27 octobre 2010 à 06:25 +0100, Al Viro a écrit :
> "i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
> n with line below".  I doubt that it's MTA braindamage.
> 
> In the first patch there's 
> 
> - * invalidate_inodes    - attempt to free all inodes on a
> + * nvalidate_inodes    - attempt to free all inodes on a
> 
> and I _really_ doubt that anything in mail system is capable of something
> that elaborate.

Again, I can not see it in my copy, I checked lkml archives too :

http://lkml.org/lkml/2010/10/27/7

Mail was fine, maybe your file system is corrupted ?



--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  5:50           ` Eric Dumazet
  (?)
@ 2010-10-27  6:01           ` Al Viro
  -1 siblings, 0 replies; 26+ messages in thread
From: Al Viro @ 2010-10-27  6:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Dave Chinner, linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 07:50:23AM +0200, Eric Dumazet wrote:
> Le mercredi 27 octobre 2010 ?? 06:25 +0100, Al Viro a ??crit :
> > "i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
> > n with line below".  I doubt that it's MTA braindamage.
> > 
> > In the first patch there's 
> > 
> > - * invalidate_inodes    - attempt to free all inodes on a
> > + * nvalidate_inodes    - attempt to free all inodes on a
> > 
> > and I _really_ doubt that anything in mail system is capable of something
> > that elaborate.
> 
> Again, I can not see it in my copy, I checked lkml archives too :
> 
> http://lkml.org/lkml/2010/10/27/7
> 
> Mail was fine, maybe your file system is corrupted ?

fs corruption inserting pieces like that?  Then we have a serious trouble
of HAL kind...

It's not a result of rediff; it's plain vi /var/mail/$USER...

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  5:50           ` Eric Dumazet
  (?)
  (?)
@ 2010-10-27  6:09           ` Davidlohr Bueso
  -1 siblings, 0 replies; 26+ messages in thread
From: Davidlohr Bueso @ 2010-10-27  6:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Al Viro, Dave Chinner, linux-fsdevel, linux-kernel

On Wed, 2010-10-27 at 07:50 +0200, Eric Dumazet wrote:
> Le mercredi 27 octobre 2010 à 06:25 +0100, Al Viro a écrit :
> > "i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
> > n with line below".  I doubt that it's MTA braindamage.
> > 
> > In the first patch there's 
> > 
> > - * invalidate_inodes    - attempt to free all inodes on a
> > + * nvalidate_inodes    - attempt to free all inodes on a
> > 
> > and I _really_ doubt that anything in mail system is capable of something
> > that elaborate.
> 
> Again, I can not see it in my copy, I checked lkml archives too :
> 
Neither can I, the patch is good (from a character point of view).


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/4] fs: protect inode->i_state with inode->i_lock
  2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
@ 2010-10-27  7:07   ` Christian Stroetmann
  2010-10-27  8:58   ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christian Stroetmann @ 2010-10-27  7:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

  Aloha;

some typos

On the 27.10.2010 06:23, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> Protect inode state transitions and validity checks with the
> inode->i_lock. This enables us to make inode state transitions
> independently of the inode_lock and is the first step to peeling
> away the inode_lock from the code.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---
>   fs/drop_caches.c       |    6 +++-
>   fs/fs-writeback.c      |   31 ++++++++++++++--
>   fs/inode.c             |   95 +++++++++++++++++++++++++++++++++++++++---------
>   fs/notify/inode_mark.c |   17 ++++++---
>   fs/quota/dquot.c       |    6 +++-
>   5 files changed, 127 insertions(+), 28 deletions(-)
>
> diff --git a/fs/drop_caches.c b/fs/drop_caches.c
> index 2195c21..c495fc3 100644
> --- a/fs/drop_caches.c
> +++ b/fs/drop_caches.c
> @@ -18,8 +18,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
>
>   	spin_lock(&inode_lock);
>   	list_for_each_entry(inode,&sb->s_inodes, i_sb_list) {
> -		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW)) {
> +			spin_unlock(&inode->i_lock);
>   			continue;
> +		}
> +		spin_unlock(&inode->i_lock);
>   		if (inode->i_mapping->nrpages == 0)
>   			continue;
>   		__iget(inode);
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index f6af81a..bd9204d 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -293,9 +293,11 @@ static void inode_wait_for_writeback(struct inode *inode)
>
>   	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
>   	 while (inode->i_state&  I_SYNC) {
> +		spin_unlock(&inode->i_lock);
>   		spin_unlock(&inode_lock);
>   		__wait_on_bit(wqh,&wq, inode_wait, TASK_UNINTERRUPTIBLE);
>   		spin_lock(&inode_lock);
> +		spin_lock(&inode->i_lock);
>   	}
>   }
>
> @@ -319,6 +321,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   	unsigned dirty;
>   	int ret;
>
> +	spin_lock(&inode->i_lock);
>   	if (!atomic_read(&inode->i_count))
>   		WARN_ON(!(inode->i_state&  (I_WILL_FREE|I_FREEING)));
>   	else
> @@ -334,6 +337,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   		 * completed a full scan of b_io.
>   		 */
>   		if (wbc->sync_mode != WB_SYNC_ALL) {
> +			spin_unlock(&inode->i_lock);
>   			requeue_io(inode);
>   			return 0;
>   		}
> @@ -349,6 +353,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   	/* Set I_SYNC, reset I_DIRTY_PAGES */
>   	inode->i_state |= I_SYNC;
>   	inode->i_state&= ~I_DIRTY_PAGES;
> +	spin_unlock(&inode->i_lock);
>   	spin_unlock(&inode_lock);
>
>   	ret = do_writepages(mapping, wbc);
> @@ -370,8 +375,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   	 * write_inode()
>   	 */
>   	spin_lock(&inode_lock);
> +	spin_lock(&inode->i_lock);
>   	dirty = inode->i_state&  I_DIRTY;
>   	inode->i_state&= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
> +	spin_unlock(&inode->i_lock);
>   	spin_unlock(&inode_lock);
>   	/* Don't write the inode if only I_DIRTY_PAGES was set */
end point?!
>   	if (dirty&  (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> @@ -381,6 +388,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   	}
>
>   	spin_lock(&inode_lock);
> +	spin_lock(&inode->i_lock);
>   	inode->i_state&= ~I_SYNC;
>   	if (!(inode->i_state&  I_FREEING)) {
>   		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> @@ -422,6 +430,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>   		}
>   	}
>   	inode_sync_complete(inode);
> +	spin_unlock(&inode->i_lock);
>   	return ret;
>   }
>
> @@ -492,10 +501,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
>   		 * kind does not need peridic writeout yet, and for the latter
periodic
>   		* kind writeout is handled by the freer.
>   		 */
> +		spin_lock(&inode->i_lock);
>   		if (inode->i_state&  (I_NEW | I_FREEING | I_WILL_FREE)) {
> +			spin_unlock(&inode->i_lock);
>   			requeue_io(inode);
>   			continue;
>   		}
> +		spin_unlock(&inode->i_lock);
>
>   		/*
>   		 * Was this inode dirtied after sync_sb_inodes was called?
> @@ -681,7 +693,9 @@ static long wb_writeback(struct bdi_writeback *wb,
>   		if (!list_empty(&wb->b_more_io))  {
>   			inode = wb_inode(wb->b_more_io.prev);
>   			trace_wbc_writeback_wait(&wbc, wb->bdi);
> +			spin_lock(&inode->i_lock);
>   			inode_wait_for_writeback(inode);
> +			spin_unlock(&inode->i_lock);
>   		}
>   		spin_unlock(&inode_lock);
>   	}
> @@ -947,6 +961,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>   		block_dump___mark_inode_dirty(inode);
>
>   	spin_lock(&inode_lock);
> +	spin_lock(&inode->i_lock);
>   	if ((inode->i_state&  flags) != flags) {
>   		const int was_dirty = inode->i_state&  I_DIRTY;
>
> @@ -958,7 +973,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>   		 * superblock list, based upon its state.
>   		 */
>   		if (inode->i_state&  I_SYNC)
> -			goto out;
> +			goto out_unlock_inode;
>
>   		/*
>   		 * Only add valid (hashed) inodes to the superblock's
> @@ -966,11 +981,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>   		 */
>   		if (!S_ISBLK(inode->i_mode)) {
>   			if (inode_unhashed(inode))
> -				goto out;
> +				goto out_unlock_inode;
>   		}
>   		if (inode->i_state&  I_FREEING)
> -			goto out;
> +			goto out_unlock_inode;
>
> +		spin_unlock(&inode->i_lock);
>   		/*
>   		 * If the inode was already on b_dirty/b_io/b_more_io, don't
>   		 * reposition it (that would break b_dirty time-ordering).
> @@ -995,7 +1011,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>   			inode->dirtied_when = jiffies;
>   			list_move(&inode->i_wb_list,&bdi->wb.b_dirty);
>   		}
> +		goto out;
>   	}
> +out_unlock_inode:
> +	spin_unlock(&inode->i_lock);
>   out:
>   	spin_unlock(&inode_lock);
>
> @@ -1043,8 +1062,12 @@ static void wait_sb_inodes(struct super_block *sb)
>   	list_for_each_entry(inode,&sb->s_inodes, i_sb_list) {
>   		struct address_space *mapping;
>
> -		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW)) {
> +			spin_unlock(&inode->i_lock);
>   			continue;
> +		}
> +		spin_unlock(&inode->i_lock);
>   		mapping = inode->i_mapping;
>   		if (mapping->nrpages == 0)
>   			continue;
> diff --git a/fs/inode.c b/fs/inode.c
> index a6d6068..eaba6ce 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -26,6 +26,17 @@
>   #include<linux/posix_acl.h>
>
>   /*
> + * inode locking rules.
> + *
> + * inode->i_lock protects:
> + *   i_state
> + *
> + * Lock ordering:
> + * inode_lock
> + *   inode->i_lock
> + */
> +
> +/*
>    * This is needed for the following functions:
>    *  - inode_has_buffers
>    *  - invalidate_bdev
> @@ -429,7 +440,9 @@ void end_writeback(struct inode *inode)
>   	BUG_ON(!(inode->i_state&  I_FREEING));
>   	BUG_ON(inode->i_state&  I_CLEAR);
>   	inode_sync_wait(inode);
> +	spin_lock(&inode->i_lock);
>   	inode->i_state = I_FREEING | I_CLEAR;
> +	spin_unlock(&inode->i_lock);
>   }
>   EXPORT_SYMBOL(end_writeback);
>
> @@ -498,12 +511,17 @@ void evict_inodes(struct super_block *sb)
>   		if (atomic_read(&inode->i_count))
>   			continue;
>
> +		spin_lock(&inode->i_lock);
>   		if (inode->i_state&  (I_NEW | I_FREEING | I_WILL_FREE)) {
> +			spin_unlock(&inode->i_lock);
>   			WARN_ON(1);
>   			continue;
>   		}
>
>   		inode->i_state |= I_FREEING;
> +		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> +			percpu_counter_dec(&nr_inodes_unused);
> +		spin_unlock(&inode->i_lock);
>
>   		/*
>   		 * Move the inode off the IO lists and LRU once I_FREEING is
> @@ -511,8 +529,6 @@ void evict_inodes(struct super_block *sb)
>   		 */
>   		list_move(&inode->i_lru,&dispose);
>   		list_del_init(&inode->i_wb_list);
> -		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
>   	}
>   	spin_unlock(&inode_lock);
>
> @@ -521,7 +537,7 @@ void evict_inodes(struct super_block *sb)
>   }
>
>   /**
> - * invalidate_inodes	- attempt to free all inodes on a superblock
> + * nvalidate_inodes	- attempt to free all inodes on a superblock
attempts
invalidate_inodes
>    * @sb:		superblock to operate on
>    *
>    * Attempts to free all inodes for a given superblock.  If there were any
> @@ -537,14 +553,21 @@ int invalidate_inodes(struct super_block *sb)
>
>   	spin_lock(&inode_lock);
>   	list_for_each_entry_safe(inode, next,&sb->s_inodes, i_sb_list) {
> -		if (inode->i_state&  (I_NEW | I_FREEING | I_WILL_FREE))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state&  (I_NEW | I_FREEING | I_WILL_FREE)) {
> +			spin_unlock(&inode->i_lock);
>   			continue;
> +		}
>   		if (atomic_read(&inode->i_count)) {
> +			spin_unlock(&inode->i_lock);
>   			busy = 1;
>   			continue;
>   		}
>
>   		inode->i_state |= I_FREEING;
> +		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> +			percpu_counter_dec(&nr_inodes_unused);
> +		spin_unlock(&inode->i_lock);
>
>   		/*
>   		 * Move the inode off the IO lists and LRU once I_FREEING is
> @@ -552,8 +575,6 @@ int invalidate_inodes(struct super_block *sb)
>   		 */
>   		list_move(&inode->i_lru,&dispose);
>   		list_del_init(&inode->i_wb_list);
> -		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
>   	}
>   	spin_unlock(&inode_lock);
>
> @@ -612,8 +633,10 @@ static void prune_icache(int nr_to_scan)
>   		 * Referenced or dirty inodes are still in use. Give them
>   		 * another pass through the LRU as we canot reclaim them now.
cannot
>   		*/
> +		spin_lock(&inode->i_lock);
>   		if (atomic_read(&inode->i_count) ||
>   		    (inode->i_state&  ~I_REFERENCED)) {
> +			spin_unlock(&inode->i_lock);
>   			list_del_init(&inode->i_lru);
>   			percpu_counter_dec(&nr_inodes_unused);
>   			continue;
> @@ -621,12 +644,14 @@ static void prune_icache(int nr_to_scan)
>
>   		/* recently referenced inodes get one more pass */
>   		if (inode->i_state&  I_REFERENCED) {
> -			list_move(&inode->i_lru,&inode_lru);
>   			inode->i_state&= ~I_REFERENCED;
> +			spin_unlock(&inode->i_lock);
> +			list_move(&inode->i_lru,&inode_lru);
>   			continue;
>   		}
>   		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
>   			__iget(inode);
> +			spin_unlock(&inode->i_lock);
>   			spin_unlock(&inode_lock);
>   			if (remove_inode_buffers(inode))
>   				reap += invalidate_mapping_pages(&inode->i_data,
> @@ -637,11 +662,15 @@ static void prune_icache(int nr_to_scan)
>   			if (inode != list_entry(inode_lru.next,
>   						struct inode, i_lru))
>   				continue;	/* wrong inode or list_empty */
> -			if (!can_unuse(inode))
> +			spin_lock(&inode->i_lock);
> +			if (!can_unuse(inode)) {
> +				spin_unlock(&inode->i_lock);
>   				continue;
> +			}
>   		}
>   		WARN_ON(inode->i_state&  I_NEW);
>   		inode->i_state |= I_FREEING;
> +		spin_unlock(&inode->i_lock);
>
>   		/*
>   		 * Move the inode off the IO lists and LRU once I_FREEING is
> @@ -708,10 +737,12 @@ repeat:
>   			continue;
>   		if (!test(inode, data))
>   			continue;
> +		spin_lock(&inode->i_lock);
>   		if (inode->i_state&  (I_FREEING|I_WILL_FREE)) {
>   			__wait_on_freeing_inode(inode);
>   			goto repeat;
>   		}
> +		spin_unlock(&inode->i_lock);
>   		__iget(inode);
>   		return inode;
>   	}
> @@ -734,10 +765,12 @@ repeat:
>   			continue;
>   		if (inode->i_sb != sb)
>   			continue;
> +		spin_lock(&inode->i_lock);
>   		if (inode->i_state&  (I_FREEING|I_WILL_FREE)) {
>   			__wait_on_freeing_inode(inode);
>   			goto repeat;
>   		}
> +		spin_unlock(&inode->i_lock);
>   		__iget(inode);
>   		return inode;
>   	}
> @@ -803,8 +836,10 @@ struct inode *new_inode(struct super_block *sb)
>   	inode = alloc_inode(sb);
>   	if (inode) {
>   		spin_lock(&inode_lock);
> -		__inode_sb_list_add(inode);
> +		spin_lock(&inode->i_lock);
>   		inode->i_state = 0;
> +		spin_unlock(&inode->i_lock);
> +		__inode_sb_list_add(inode);
>   		spin_unlock(&inode_lock);
>   	}
>   	return inode;
> @@ -871,9 +906,11 @@ static struct inode *get_new_inode(struct super_block *sb,
>   			if (set(inode, data))
>   				goto set_failed;
>
> +			spin_lock(&inode->i_lock);
> +			inode->i_state = I_NEW;
> +			spin_unlock(&inode->i_lock);
>   			hlist_add_head(&inode->i_hash, head);
>   			__inode_sb_list_add(inode);
> -			inode->i_state = I_NEW;
>   			spin_unlock(&inode_lock);
>
>   			/* Return the locked inode with I_NEW set, the
> @@ -917,10 +954,12 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
>   		/* We released the lock, so.. */
so what? :-)
>   		old = find_inode_fast(sb, head, ino);
>   		if (!old) {
> +			spin_lock(&inode->i_lock);
>   			inode->i_ino = ino;
> +			inode->i_state = I_NEW;
> +			spin_unlock(&inode->i_lock);
>   			hlist_add_head(&inode->i_hash, head);
>   			__inode_sb_list_add(inode);
> -			inode->i_state = I_NEW;
>   			spin_unlock(&inode_lock);
>
>   			/* Return the locked inode with I_NEW set, the
> @@ -1005,15 +1044,19 @@ EXPORT_SYMBOL(iunique);
>   struct inode *igrab(struct inode *inode)
>   {
>   	spin_lock(&inode_lock);
> -	if (!(inode->i_state&  (I_FREEING|I_WILL_FREE)))
> +	spin_lock(&inode->i_lock);
> +	if (!(inode->i_state&  (I_FREEING|I_WILL_FREE))) {
>   		__iget(inode);
> -	else
> +		spin_unlock(&inode->i_lock);
> +	} else {
> +		spin_unlock(&inode->i_lock);
>   		/*
>   		 * Handle the case where s_op->clear_inode is not been
>   		 * called yet, and somebody is calling igrab
>   		 * while the inode is getting freed.
>   		 */
>   		inode = NULL;
> +	}
>   	spin_unlock(&inode_lock);
>   	return inode;
>   }
> @@ -1242,7 +1285,9 @@ int insert_inode_locked(struct inode *inode)
>   	ino_t ino = inode->i_ino;
>   	struct hlist_head *head = inode_hashtable + hash(sb, ino);
>
> +	spin_lock(&inode->i_lock);
>   	inode->i_state |= I_NEW;
> +	spin_unlock(&inode->i_lock);
>   	while (1) {
>   		struct hlist_node *node;
>   		struct inode *old = NULL;
> @@ -1252,8 +1297,11 @@ int insert_inode_locked(struct inode *inode)
>   				continue;
>   			if (old->i_sb != sb)
>   				continue;
> -			if (old->i_state&  (I_FREEING|I_WILL_FREE))
> +			spin_lock(&old->i_lock);
> +			if (old->i_state&  (I_FREEING|I_WILL_FREE)) {
> +				spin_unlock(&old->i_lock);
>   				continue;
> +			}
>   			break;
>   		}
>   		if (likely(!node)) {
> @@ -1261,6 +1309,7 @@ int insert_inode_locked(struct inode *inode)
>   			spin_unlock(&inode_lock);
>   			return 0;
>   		}
> +		spin_unlock(&old->i_lock);
>   		__iget(old);
>   		spin_unlock(&inode_lock);
>   		wait_on_inode(old);
> @@ -1279,7 +1328,9 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
>   	struct super_block *sb = inode->i_sb;
>   	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
>
> +	spin_lock(&inode->i_lock);
>   	inode->i_state |= I_NEW;
> +	spin_unlock(&inode->i_lock);
>
>   	while (1) {
>   		struct hlist_node *node;
> @@ -1291,8 +1342,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
>   				continue;
>   			if (!test(old, data))
>   				continue;
> -			if (old->i_state&  (I_FREEING|I_WILL_FREE))
> +			spin_lock(&old->i_lock);
> +			if (old->i_state&  (I_FREEING|I_WILL_FREE)) {
> +				spin_unlock(&old->i_lock);
>   				continue;
> +			}
>   			break;
>   		}
>   		if (likely(!node)) {
> @@ -1300,6 +1354,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
>   			spin_unlock(&inode_lock);
>   			return 0;
>   		}
> +		spin_unlock(&old->i_lock);
>   		__iget(old);
>   		spin_unlock(&inode_lock);
>   		wait_on_inode(old);
> @@ -1346,6 +1401,9 @@ static void iput_final(struct inode *inode)
>   	const struct super_operations *op = inode->i_sb->s_op;
>   	int drop;
>
> +	spin_lock(&inode->i_lock);
> +	WARN_ON(inode->i_state&  I_NEW);
> +
>   	if (op&&  op->drop_inode)
>   		drop = op->drop_inode(inode);
>   	else
> @@ -1357,21 +1415,23 @@ static void iput_final(struct inode *inode)
>   			if (!(inode->i_state&  (I_DIRTY|I_SYNC))) {
>   				inode_lru_list_add(inode);
>   			}
> +			spin_unlock(&inode->i_lock);
>   			spin_unlock(&inode_lock);
>   			return;
>   		}
> -		WARN_ON(inode->i_state&  I_NEW);
>   		inode->i_state |= I_WILL_FREE;
> +		spin_unlock(&inode->i_lock);
>   		spin_unlock(&inode_lock);
>   		write_inode_now(inode, 1);
>   		spin_lock(&inode_lock);
> +		spin_lock(&inode->i_lock);
>   		WARN_ON(inode->i_state&  I_NEW);
>   		inode->i_state&= ~I_WILL_FREE;
>   		__remove_inode_hash(inode);
>   	}
>
> -	WARN_ON(inode->i_state&  I_NEW);
>   	inode->i_state |= I_FREEING;
> +	spin_unlock(&inode->i_lock);
>
>   	/*
>   	 * Move the inode off the IO lists and LRU once I_FREEING is
> @@ -1592,6 +1652,7 @@ static void __wait_on_freeing_inode(struct inode *inode)
>   	DEFINE_WAIT_BIT(wait,&inode->i_state, __I_NEW);
>   	wq = bit_waitqueue(&inode->i_state, __I_NEW);
>   	prepare_to_wait(wq,&wait.wait, TASK_UNINTERRUPTIBLE);
> +	spin_unlock(&inode->i_lock);
>   	spin_unlock(&inode_lock);
>   	schedule();
>   	finish_wait(wq,&wait.wait);
> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> index 21ed106..08f0d16 100644
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -249,8 +249,12 @@ void fsnotify_unmount_inodes(struct list_head *list)
>   		 * I_WILL_FREE, or I_NEW which is fine because by that point
>   		 * the inode cannot have any associated watches.
>   		 */
> -		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW)) {
> +			spin_unlock(&inode->i_lock);
>   			continue;
> +		}
> +		spin_unlock(&inode->i_lock);
>
>   		/*
>   		 * If i_count is zero, the inode cannot have any watches and
, then the inode
> @@ -272,10 +276,13 @@ void fsnotify_unmount_inodes(struct list_head *list)
>
>   		/* In case the dropping of a reference would nuke next_i. */
>   		if ((&next_i->i_sb_list != list)&&
> -		    atomic_read(&next_i->i_count)&&
> -		    !(next_i->i_state&  (I_FREEING | I_WILL_FREE))) {
> -			__iget(next_i);
> -			need_iput = next_i;
> +		    atomic_read(&next_i->i_count)) {
> +			spin_lock(&next_i->i_lock);
> +			if (!(next_i->i_state&  (I_FREEING | I_WILL_FREE))) {
> +				__iget(next_i);
> +				need_iput = next_i;
> +			}
> +			spin_unlock(&next_i->i_lock);
>   		}
>
>   		/*
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index aad1316..fb7c2c0 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -898,8 +898,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
>
>   	spin_lock(&inode_lock);
>   	list_for_each_entry(inode,&sb->s_inodes, i_sb_list) {
> -		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state&  (I_FREEING|I_WILL_FREE|I_NEW)) {
> +			spin_unlock(&inode->i_lock);
>   			continue;
> +		}
> +		spin_unlock(&inode->i_lock);
>   #ifdef CONFIG_QUOTA_DEBUG
>   		if (unlikely(inode_get_rsv_space(inode)>  0))
>   			reserved = 1;


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27  4:23 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner
@ 2010-10-27  7:08   ` Christian Stroetmann
  2010-10-27  9:05   ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christian Stroetmann @ 2010-10-27  7:08 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

  some typos

On the 27.10.2010 06:23, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> Introduce the inode_lru_lock to protect the inode_lru list. This
> lock is nested inside the inode->i_lock to allow the inode to be
> added to the LRU list in iput_final without needing to deal with
> lock inversions. This keeps iput_final() clean and neat.
>
> Further, where marking the inode I_FREEING and removing it from the
> LRU, move the LRU list manipulation within the inode->i_lock to keep
> the list manipulation consistent with iput_final. This also means
> that most of the open coded LRU list removal + unused inode
> accounting can now use the inode_lru_list_del() wrappers which
> cleans the code up further.
>
> However, this locking change means what the LRU traversal in
> prune_icache() inverts this lock ordering and needs to use trylock
> semantics on the inode->i_lock to avoid deadlocking. In these cases,
> if we fail to lock the inode we move it to the back of the LRU to
> prevent spinning on it.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---
>   fs/inode.c |   45 +++++++++++++++++++++++++++++----------------
>   1 files changed, 29 insertions(+), 16 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index f134aa4..e04371e 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -30,10 +30,13 @@
>    *
>    * inode->i_lock protects:
>    *   i_state
> + * inode_lru_lock protects:
> + *   inode_lru, i_lru
>    *
>    * Lock ordering:
>    * inode_lock
>    *   inode->i_lock
> + *     inode_lru_lock
>    */
>
>   /*
> @@ -83,6 +86,7 @@ static unsigned int i_hash_shift __read_mostly;
>    */
>
>   static LIST_HEAD(inode_lru);
> +static DEFINE_SPINLOCK(inode_lru_lock);
>   static struct hlist_head *inode_hashtable __read_mostly;
>
>   /*
> @@ -344,16 +348,20 @@ EXPORT_SYMBOL(ihold);
>   static void inode_lru_list_add(struct inode *inode)
>   {
>   	if (list_empty(&inode->i_lru)) {
> +		spin_lock(&inode_lru_lock);
>   		list_add(&inode->i_lru,&inode_lru);
>   		percpu_counter_inc(&nr_inodes_unused);
> +		spin_unlock(&inode_lru_lock);
>   	}
>   }
>
>   static void inode_lru_list_del(struct inode *inode)
>   {
>   	if (!list_empty(&inode->i_lru)) {
> +		spin_lock(&inode_lru_lock);
>   		list_del_init(&inode->i_lru);
>   		percpu_counter_dec(&nr_inodes_unused);
> +		spin_unlock(&inode_lru_lock);
>   	}
>   }
>
> @@ -537,15 +545,10 @@ void evict_inodes(struct super_block *sb)
>   		}
>
>   		inode->i_state |= I_FREEING;
> -		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
> +		inode_lru_list_del(inode);
>   		spin_unlock(&inode->i_lock);
>
> -		/*
> -		 * Move the inode off the IO lists and LRU once I_FREEING is
> -		 * set so that it won't get moved back on there if it is dirty.
> -		 */
> -		list_move(&inode->i_lru,&dispose);
> +		list_add(&inode->i_lru,&dispose);
>   	}
>   	spin_unlock(&inode_lock);
>
> @@ -582,15 +585,10 @@ int invalidate_inodes(struct super_block *sb)
>   		}
>
>   		inode->i_state |= I_FREEING;
> -		if (!(inode->i_state&  (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
> +		inode_lru_list_del(inode);
>   		spin_unlock(&inode->i_lock);
>
> -		/*
> -		 * Move the inode off the IO lists and LRU once I_FREEING is
> -		 * set so that it won't get moved back on there if it is dirty.
> -		 */
> -		list_move(&inode->i_lru,&dispose);
> +		list_add(&inode->i_lru,&dispose);
>   	}
>   	spin_unlock(&inode_lock);
>
> @@ -637,6 +635,7 @@ static void prune_icache(int nr_to_scan)
>
>   	down_read(&iprune_sem);
>   	spin_lock(&inode_lock);
> +	spin_lock(&inode_lru_lock);
>   	for (nr_scanned = 0; nr_scanned<  nr_to_scan; nr_scanned++) {
>   		struct inode *inode;
>
> @@ -646,10 +645,19 @@ static void prune_icache(int nr_to_scan)
>   		inode = list_entry(inode_lru.prev, struct inode, i_lru);
>
>   		/*
> +		 * we are inverting the inode_lru_lock/inode->i_lock here,
> +		 * so use a trylock. If we fail to get the lock, just move the
, then just move
or
, then move
> +		 * inode to the back of the list so we don't spin on it.
> +		 */
> +		if (!spin_trylock(&inode->i_lock)) {
> +			list_move(&inode->i_lru,&inode_lru);
> +			continue;
> +		}
> +
> +		/*
>   		 * Referenced or dirty inodes are still in use. Give them
>   		 * another pass through the LRU as we canot reclaim them now.
cannot
>   		*/
> -		spin_lock(&inode->i_lock);
>   		if (atomic_read(&inode->i_count) ||
>   		    (inode->i_state&  ~I_REFERENCED)) {
>   			spin_unlock(&inode->i_lock);
> @@ -668,17 +676,21 @@ static void prune_icache(int nr_to_scan)
>   		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
>   			__iget(inode);
>   			spin_unlock(&inode->i_lock);
> +			spin_unlock(&inode_lru_lock);
>   			spin_unlock(&inode_lock);
>   			if (remove_inode_buffers(inode))
>   				reap += invalidate_mapping_pages(&inode->i_data,
>   								0, -1);
>   			iput(inode);
>   			spin_lock(&inode_lock);
> +			spin_lock(&inode_lru_lock);
>
>   			if (inode != list_entry(inode_lru.next,
>   						struct inode, i_lru))
>   				continue;	/* wrong inode or list_empty */
> -			spin_lock(&inode->i_lock);
> +			/* avoid lock inversions with trylock */
> +			if (!spin_trylock(&inode->i_lock))
> +				continue;
>   			if (!can_unuse(inode)) {
>   				spin_unlock(&inode->i_lock);
>   				continue;
> @@ -699,6 +711,7 @@ static void prune_icache(int nr_to_scan)
>   		__count_vm_events(KSWAPD_INODESTEAL, reap);
>   	else
>   		__count_vm_events(PGINODESTEAL, reap);
> +	spin_unlock(&inode_lru_lock);
>   	spin_unlock(&inode_lock);
>
>   	dispose_list(&freeable);


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  5:50           ` Eric Dumazet
                             ` (2 preceding siblings ...)
  (?)
@ 2010-10-27  7:11           ` Christian Stroetmann
  -1 siblings, 0 replies; 26+ messages in thread
From: Christian Stroetmann @ 2010-10-27  7:11 UTC (permalink / raw)
  To: Eric Dumazet, Al Viro, linux-fsdevel, linux-kernel

  On the 27.10.2010 07:50, Eric Dumazet wrote :
> Le mercredi 27 octobre 2010 à 06:25 +0100, Al Viro a écrit :
>> "i\xe1\xb9\x89ode_lock", i.e. 'n' turned into U+1E49, aka "latin small letter
>> n with line below".  I doubt that it's MTA braindamage.
>>
>> In the first patch there's
>>
>> - * invalidate_inodes    - attempt to free all inodes on a
>> + * nvalidate_inodes    - attempt to free all inodes on a
>>
>> and I _really_ doubt that anything in mail system is capable of something
>> that elaborate.
> Again, I can not see it in my copy, I checked lkml archives too :
>
> http://lkml.org/lkml/2010/10/27/7
>
> Mail was fine, maybe your file system is corrupted ?
>
I have it in patch 4/4 too.
But in patch 1/4 could it be just a typo?

Christian Stroetmann

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/4] fs: factor inode disposal
  2010-10-27  4:23 ` [PATCH 2/4] fs: factor inode disposal Dave Chinner
@ 2010-10-27  7:55   ` Christoph Hellwig
  2010-10-27  9:06   ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Hellwig @ 2010-10-27  7:55 UTC (permalink / raw)
  To: Dave Chinner; +Cc: viro, linux-fsdevel, linux-kernel

>  /*
> + * Free the inode passed in, removing it from the lists it is still connected
> + * to but avoiding unnecessary lock round-trips for the lists it is no longer
> + * on.
> + *
> + * An inode must already be marked I_FREEING so that we avoid the inode being
> + * moved back onto lists if we race with other code that manipulates the lists
> + * (e.g. writeback_single_inode_inode). The caller is responsisble for setting this.

Too long line.

> + */
> +static void dispose_one_inode(struct inode *inode)
> +{
> +	BUG_ON(!(inode->i_state & I_FREEING));
> +
> +	spin_lock(&inode_lock);
> +	list_del_init(&inode->i_wb_list);
> +	__remove_inode_hash(inode);
> +	__inode_sb_list_del(inode);
> +	spin_unlock(&inode_lock);
> +
> +	evict(inode);
> +
> +	wake_up_inode(inode);
> +	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
> +	destroy_inode(inode);
> +}

As this is the only caller of evict left I think the code should just
be added to evict instead of a new function.  Also the hash removal
should happen after evict, so that __wait_on_freeing_inode still works.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/4] fs: protect inode->i_state with inode->i_lock
  2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
  2010-10-27  7:07   ` Christian Stroetmann
@ 2010-10-27  8:58   ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Hellwig @ 2010-10-27  8:58 UTC (permalink / raw)
  To: Dave Chinner; +Cc: viro, linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 03:23:01PM +1100, Dave Chinner wrote:
>  	spin_lock(&inode_lock);
>  	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> -		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
> +		spin_lock(&inode->i_lock);
> +		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> +			spin_unlock(&inode->i_lock);
>  			continue;
> +		}
> +		spin_unlock(&inode->i_lock);
>  		if (inode->i_mapping->nrpages == 0)
>  			continue;
>  		__iget(inode);

If you want to remove inode_lock from the lru scanning later you already
need to extend i_lock coverage to include __iget here.  Otherwise we
could race to mark the inode as I_FREEING or I_WILL_FREE before we
grabbed a reference after your patchset. 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27  4:23 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner
  2010-10-27  7:08   ` Christian Stroetmann
@ 2010-10-27  9:05   ` Christoph Hellwig
  2010-10-27 22:24     ` Dave Chinner
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2010-10-27  9:05 UTC (permalink / raw)
  To: Dave Chinner; +Cc: viro, linux-fsdevel, linux-kernel

> @@ -30,10 +30,13 @@
>   *
>   * inode->i_lock protects:
>   *   i_state
> + * inode_lru_lock protects:
> + *   inode_lru, i_lru
>   *
>   * Lock ordering:
>   * inode_lock
>   *   inode->i_lock
> + *     inode_lru_lock
>   */

Always writing the inode fields as inode->i_foo might be better.

> @@ -537,15 +545,10 @@ void evict_inodes(struct super_block *sb)
>  		}
>  
>  		inode->i_state |= I_FREEING;
> -		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
> +		inode_lru_list_del(inode);
>  		spin_unlock(&inode->i_lock);
>  
> -		/*
> -		 * Move the inode off the IO lists and LRU once I_FREEING is
> -		 * set so that it won't get moved back on there if it is dirty.
> -		 */
> -		list_move(&inode->i_lru, &dispose);
> +		list_add(&inode->i_lru, &dispose);
>  	}
>  	spin_unlock(&inode_lock);
>  
> @@ -582,15 +585,10 @@ int invalidate_inodes(struct super_block *sb)
>  		}
>  
>  		inode->i_state |= I_FREEING;
> -		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
> -			percpu_counter_dec(&nr_inodes_unused);
> +		inode_lru_list_del(inode);
>  		spin_unlock(&inode->i_lock);

with this scheme we now decrement nr_inodes_unused twice - once in
invalidate_inodes/evict_inodes and once in dispose_one_inode.  I think
you just want to use an opencoded list_move under the lru lock to
move the inode to the temporary list for now, similar to what
prune_icache does.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/4] fs: factor inode disposal
  2010-10-27  4:23 ` [PATCH 2/4] fs: factor inode disposal Dave Chinner
  2010-10-27  7:55   ` Christoph Hellwig
@ 2010-10-27  9:06   ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Hellwig @ 2010-10-27  9:06 UTC (permalink / raw)
  To: Dave Chinner; +Cc: viro, linux-fsdevel, linux-kernel

> +	if (!drop && (sb->s_flags & MS_ACTIVE)) {
> +		inode->i_state |= I_REFERENCED;
> +		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
> +			inode_lru_list_add(inode);
> +		spin_unlock(&inode->i_lock);
> +		spin_unlock(&inode_lock);
> +		return;
> +	}
> +
>  	if (!drop) {
> -		if (sb->s_flags & MS_ACTIVE) {
> -			inode->i_state |= I_REFERENCED;
> -			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
> -				inode_lru_list_add(inode);
> -			}
> -			spin_unlock(&inode->i_lock);
> -			spin_unlock(&inode_lock);
> -			return;
> -		}

Btw, I'm really not sure what this change buys us.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
  2010-10-27  4:40   ` Al Viro
@ 2010-10-27  9:12       ` Dave Chinner
  2010-10-27  9:12       ` Dave Chinner
  1 sibling, 0 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  9:12 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 05:40:38AM +0100, Al Viro wrote:
> On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that inode state changes are protected by the inode->i_lock and
> > the inode LRU manipulations by the inode_lru_lock, we can remove the
> > inode_lock from prune_icache and the initial part of iput_final().
> > 
> > instead of using the inode_lock to protect the inode during
> > iput_final, use the inode->i_lock instead. This protects the inode
> > against new references being taken while we change the inode state
> > to I_FREEING, as well as preventing prune_icache from grabbing the
> > inode while we are manipulating it. Hence we no longer need the
> > i???ode_lock in iput_final prior to setting I_FREEING on the inode.
>   ^^^^^^^^^^^^
> 
> ... the hell?  There's more such damage elsewhere in the thread; what's
> going on?

vim utf-8 multibyte support that is causing these characters to be
created. e.g.  e<backspace>' results in é. sometimes the resultant
character looks almost identical and so I didn't notice. I haven't
found the magic recipe to turn this off (maxcombine=0 doesn't seem
to stop it) or change the special compose sequence, so I'll keep
looking.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache
@ 2010-10-27  9:12       ` Dave Chinner
  0 siblings, 0 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27  9:12 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 05:40:38AM +0100, Al Viro wrote:
> On Wed, Oct 27, 2010 at 03:23:04PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that inode state changes are protected by the inode->i_lock and
> > the inode LRU manipulations by the inode_lru_lock, we can remove the
> > inode_lock from prune_icache and the initial part of iput_final().
> > 
> > instead of using the inode_lock to protect the inode during
> > iput_final, use the inode->i_lock instead. This protects the inode
> > against new references being taken while we change the inode state
> > to I_FREEING, as well as preventing prune_icache from grabbing the
> > inode while we are manipulating it. Hence we no longer need the
> > i???ode_lock in iput_final prior to setting I_FREEING on the inode.
>   ^^^^^^^^^^^^
> 
> ... the hell?  There's more such damage elsewhere in the thread; what's
> going on?

vim utf-8 multibyte support that is causing these characters to be
created. e.g.  e<backspace>' results in é. sometimes the resultant
character looks almost identical and so I didn't notice. I haven't
found the magic recipe to turn this off (maxcombine=0 doesn't seem
to stop it) or change the special compose sequence, so I'll keep
looking.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27  9:05   ` Christoph Hellwig
@ 2010-10-27 22:24     ` Dave Chinner
  2010-10-28 10:25       ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Dave Chinner @ 2010-10-27 22:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: viro, linux-fsdevel, linux-kernel

On Wed, Oct 27, 2010 at 05:05:30AM -0400, Christoph Hellwig wrote:
> > @@ -537,15 +545,10 @@ void evict_inodes(struct super_block *sb)
> >  		}
> >  
> >  		inode->i_state |= I_FREEING;
> > -		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
> > -			percpu_counter_dec(&nr_inodes_unused);
> > +		inode_lru_list_del(inode);
> >  		spin_unlock(&inode->i_lock);
> >  
> > -		/*
> > -		 * Move the inode off the IO lists and LRU once I_FREEING is
> > -		 * set so that it won't get moved back on there if it is dirty.
> > -		 */
> > -		list_move(&inode->i_lru, &dispose);
> > +		list_add(&inode->i_lru, &dispose);
> >  	}
> >  	spin_unlock(&inode_lock);
> >  
> > @@ -582,15 +585,10 @@ int invalidate_inodes(struct super_block *sb)
> >  		}
> >  
> >  		inode->i_state |= I_FREEING;
> > -		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
> > -			percpu_counter_dec(&nr_inodes_unused);
> > +		inode_lru_list_del(inode);
> >  		spin_unlock(&inode->i_lock);
> 
> with this scheme we now decrement nr_inodes_unused twice - once in
> invalidate_inodes/evict_inodes and once in dispose_one_inode. 

That doesn't happen because the counter is only modified when
the inode is moved on/off the list and there are checks to avoid
removing an inode that is not on the list. Also, the inode is not
removed from the LRU in dispose_one_inode - it is always done when
the inode is marked I_FREEING while the i_lock is held before
calling dispose_one_inode().

Basically I wanted to remove the strange "inode is not on the LRU if
it is dirty or under writeback" accounting checks and make the
accounting symmetric with adding/removing the inodes from the LRU.
These are protected by list_empty() checks, so should always end up
with the correct accounting.

hence the only special case now is prune_icache() which already
holds the inode_lru_lock() so can't call the helper. Besides, we
don't need any checks there because we know the inode is on the LRU
already....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27 23:02 fs: break out inode LRU operations from inode_lock V2 Dave Chinner
@ 2010-10-27 23:02 ` Dave Chinner
  0 siblings, 0 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-27 23:02 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, linux-kernel

From: Dave Chinner <dchinner@redhat.com>

Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock to allow the inode to be
added to the LRU list in iput_final without needing to deal with
lock inversions. This keeps iput_final() clean and neat.

Further, where marking the inode I_FREEING and removing it from the
LRU, move the LRU list manipulation within the inode->i_lock to keep
the list manipulation consistent with iput_final. This also means
that most of the open coded LRU list removal + unused inode
accounting can now use the inode_lru_list_del() wrappers which
cleans the code up further.

However, this locking change means what the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c |   45 +++++++++++++++++++++++++++++----------------
 1 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index bd1688b..da741e7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -30,10 +30,13 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, __iget()
+ * inode_lru_lock protects:
+ *   inode_lru, inode->i_lru
  *
  * Lock ordering:
  * inode_lock
  *   inode->i_lock
+ *     inode_lru_lock
  */
 
 /*
@@ -83,6 +86,7 @@ static unsigned int i_hash_shift __read_mostly;
  */
 
 static LIST_HEAD(inode_lru);
+static DEFINE_SPINLOCK(inode_lru_lock);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -343,18 +347,22 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
 		percpu_counter_inc(&nr_inodes_unused);
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
 		percpu_counter_dec(&nr_inodes_unused);
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static inline void __inode_sb_list_add(struct inode *inode)
@@ -521,15 +529,10 @@ void evict_inodes(struct super_block *sb)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
 
-		/*
-		 * Move the inode off the LRU once I_FREEING is set so that it
-		 * won't get moved back on.
-		 */
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -566,15 +569,10 @@ int invalidate_inodes(struct super_block *sb)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
 
-		/*
-		 * Move the inode off the LRU once I_FREEING is
-		 * set so that it won't get moved back on.
-		 */
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -621,6 +619,7 @@ static void prune_icache(int nr_to_scan)
 
 	down_read(&iprune_sem);
 	spin_lock(&inode_lock);
+	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
@@ -630,10 +629,19 @@ static void prune_icache(int nr_to_scan)
 		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
 		/*
+		 * we are inverting the inode_lru_lock/inode->i_lock here,
+		 * so use a trylock. If we fail to get the lock, just move the
+		 * inode to the back of the list so we don't spin on it.
+		 */
+		if (!spin_trylock(&inode->i_lock)) {
+			list_move(&inode->i_lru, &inode_lru);
+			continue;
+		}
+
+		/*
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
 		 */
-		spin_lock(&inode->i_lock);
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			spin_unlock(&inode->i_lock);
@@ -652,17 +660,21 @@ static void prune_icache(int nr_to_scan)
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_lru_lock);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
 			spin_lock(&inode_lock);
+			spin_lock(&inode_lru_lock);
 
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
-			spin_lock(&inode->i_lock);
+			/* avoid lock inversions with trylock */
+			if (!spin_trylock(&inode->i_lock))
+				continue;
 			if (!can_unuse(inode)) {
 				spin_unlock(&inode->i_lock);
 				continue;
@@ -683,6 +695,7 @@ static void prune_icache(int nr_to_scan)
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
 		__count_vm_events(PGINODESTEAL, reap);
+	spin_unlock(&inode_lru_lock);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-27 22:24     ` Dave Chinner
@ 2010-10-28 10:25       ` Christoph Hellwig
  2010-10-28 10:58         ` Dave Chinner
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2010-10-28 10:25 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, viro, linux-fsdevel, linux-kernel

> That doesn't happen because the counter is only modified when
> the inode is moved on/off the list and there are checks to avoid
> removing an inode that is not on the list. Also, the inode is not
> removed from the LRU in dispose_one_inode - it is always done when
> the inode is marked I_FREEING while the i_lock is held before
> calling dispose_one_inode().
> 
> Basically I wanted to remove the strange "inode is not on the LRU if
> it is dirty or under writeback" accounting checks and make the
> accounting symmetric with adding/removing the inodes from the LRU.
> These are protected by list_empty() checks, so should always end up
> with the correct accounting.
> 
> hence the only special case now is prune_icache() which already
> holds the inode_lru_lock() so can't call the helper. Besides, we
> don't need any checks there because we know the inode is on the LRU
> already....

Indeed.  What about adding a

	BUG_ON(!list_empty(&inode->i_lru));

to evict to ensure this invariant?


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/4] fs: Lock the inode LRU list separately
  2010-10-28 10:25       ` Christoph Hellwig
@ 2010-10-28 10:58         ` Dave Chinner
  0 siblings, 0 replies; 26+ messages in thread
From: Dave Chinner @ 2010-10-28 10:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: viro, linux-fsdevel, linux-kernel

On Thu, Oct 28, 2010 at 06:25:21AM -0400, Christoph Hellwig wrote:
> > That doesn't happen because the counter is only modified when
> > the inode is moved on/off the list and there are checks to avoid
> > removing an inode that is not on the list. Also, the inode is not
> > removed from the LRU in dispose_one_inode - it is always done when
> > the inode is marked I_FREEING while the i_lock is held before
> > calling dispose_one_inode().
> > 
> > Basically I wanted to remove the strange "inode is not on the LRU if
> > it is dirty or under writeback" accounting checks and make the
> > accounting symmetric with adding/removing the inodes from the LRU.
> > These are protected by list_empty() checks, so should always end up
> > with the correct accounting.
> > 
> > hence the only special case now is prune_icache() which already
> > holds the inode_lru_lock() so can't call the helper. Besides, we
> > don't need any checks there because we know the inode is on the LRU
> > already....
> 
> Indeed.  What about adding a
> 
> 	BUG_ON(!list_empty(&inode->i_lru));
> 
> to evict to ensure this invariant?

Yup, sounds like a good idea.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2010-10-28 10:58 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-27  4:23 fs: break out inode LRU operations from node_lock Dave Chinner
2010-10-27  4:23 ` [PATCH 1/4] fs: protect inode->i_state with inode->i_lock Dave Chinner
2010-10-27  7:07   ` Christian Stroetmann
2010-10-27  8:58   ` Christoph Hellwig
2010-10-27  4:23 ` [PATCH 2/4] fs: factor inode disposal Dave Chinner
2010-10-27  7:55   ` Christoph Hellwig
2010-10-27  9:06   ` Christoph Hellwig
2010-10-27  4:23 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner
2010-10-27  7:08   ` Christian Stroetmann
2010-10-27  9:05   ` Christoph Hellwig
2010-10-27 22:24     ` Dave Chinner
2010-10-28 10:25       ` Christoph Hellwig
2010-10-28 10:58         ` Dave Chinner
2010-10-27  4:23 ` [PATCH 4/4] fs: remove inode_lock from iput_final and prune_icache Dave Chinner
2010-10-27  4:40   ` Al Viro
2010-10-27  4:47     ` Eric Dumazet
2010-10-27  4:47       ` Eric Dumazet
2010-10-27  5:25       ` Al Viro
2010-10-27  5:50         ` Eric Dumazet
2010-10-27  5:50           ` Eric Dumazet
2010-10-27  6:01           ` Al Viro
2010-10-27  6:09           ` Davidlohr Bueso
2010-10-27  7:11           ` Christian Stroetmann
2010-10-27  9:12     ` Dave Chinner
2010-10-27  9:12       ` Dave Chinner
  -- strict thread matches above, loose matches on Subject: below --
2010-10-27 23:02 fs: break out inode LRU operations from inode_lock V2 Dave Chinner
2010-10-27 23:02 ` [PATCH 3/4] fs: Lock the inode LRU list separately Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.