All of lore.kernel.org
 help / color / mirror / Atom feed
From: npiggin@kernel.dk
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: [patch 16/35] fs: icache lazy inode lru
Date: Tue, 19 Oct 2010 14:42:32 +1100	[thread overview]
Message-ID: <20101019034657.147371115@kernel.dk> (raw)
In-Reply-To: 20101019034216.319085068@kernel.dk

[-- Attachment #1: fs-inode_lock-scale-10.patch --]
[-- Type: text/plain, Size: 8964 bytes --]

Implement lazy inode lru similarly to dcache. That is, avoid moving inodes
around the LRU list in iget/iput operations and defer the refcount check
to reclaim-time. Use a flag, I_REFERENCED, to tell reclaim that iget has
touched the inode in the past.

This will reduce lock acquisition, and will also improve lock ordering
with subsequent patches.

The global inode_in_use list goes away, and with it the
!list_empty(&inode->i_list) invariant.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>

---
 fs/fs-writeback.c         |    7 ---
 fs/inode.c                |   98 ++++++++++++++++++++++------------------------
 include/linux/fs.h        |   20 ++++++---
 include/linux/writeback.h |    1 
 4 files changed, 61 insertions(+), 65 deletions(-)

Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-10-19 14:18:59.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-10-19 14:19:29.000000000 +1100
@@ -94,7 +94,6 @@
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
 LIST_HEAD(inode_unused);
 
 struct inode_hash_bucket {
@@ -299,6 +298,7 @@
 	INIT_HLIST_BL_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_list);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -320,25 +320,6 @@
 	inode_init_once(inode);
 }
 
-/*
- * i_lock must be held
- */
-void __iget(struct inode *inode)
-{
-	assert_spin_locked(&inode->i_lock);
-
-	inode->i_count++;
-	if (inode->i_count > 1)
-		return;
-
-	if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
-		spin_lock(&wb_inode_list_lock);
-		list_move(&inode->i_list, &inode_in_use);
-		spin_unlock(&wb_inode_list_lock);
-	}
-	atomic_dec(&inodes_stat.nr_unused);
-}
-
 void end_writeback(struct inode *inode)
 {
 	might_sleep();
@@ -383,7 +364,7 @@
 		struct inode *inode;
 
 		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		list_del_init(&inode->i_list);
 
 		evict(inode);
 
@@ -432,11 +413,12 @@
 		invalidate_inode_buffers(inode);
 		if (!inode->i_count) {
 			spin_lock(&wb_inode_list_lock);
-			list_move(&inode->i_list, dispose);
+			list_del(&inode->i_list);
 			spin_unlock(&wb_inode_list_lock);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
 			spin_unlock(&inode->i_lock);
+			list_add(&inode->i_list, dispose);
 			count++;
 			continue;
 		}
@@ -476,7 +458,7 @@
 
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -504,13 +486,12 @@
 {
 	LIST_HEAD(freeable);
 	int nr_pruned = 0;
-	int nr_scanned;
 	unsigned long reap = 0;
 
 	down_read(&iprune_sem);
 again:
 	spin_lock(&wb_inode_list_lock);
-	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+	for (; nr_to_scan; nr_to_scan--) {
 		struct inode *inode;
 
 		if (list_empty(&inode_unused))
@@ -522,34 +503,47 @@
 			spin_unlock(&wb_inode_list_lock);
 			goto again;
 		}
-		if (inode->i_state || inode->i_count) {
+		if (inode->i_count || (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_list);
+			spin_unlock(&inode->i_lock);
+			atomic_dec(&inodes_stat.nr_unused);
+			continue;
+		}
+		if (inode->i_state & I_REFERENCED) {
 			list_move(&inode->i_list, &inode_unused);
+			inode->i_state &= ~I_REFERENCED;
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+			/*
+			 * Move back to the head of the unused list in case the
+			 * invalidations failed. Could improve this by going to
+			 * the head of the list only if invalidation fails.
+			 *
+			 * We'll try to get it back if it becomes freeable.
+			 */
+			list_move(&inode->i_list, &inode_unused);
 			spin_unlock(&wb_inode_list_lock);
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
+
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
-again2:
 			spin_lock(&wb_inode_list_lock);
-
-			if (inode != list_entry(inode_unused.next,
-						struct inode, i_list))
-				continue;	/* wrong inode or list_empty */
-			if (!spin_trylock(&inode->i_lock)) {
-				spin_unlock(&wb_inode_list_lock);
-				goto again2;
-			}
-			if (!can_unuse(inode)) {
-				spin_unlock(&inode->i_lock);
-				continue;
+			if (inode == list_entry(inode_unused.next,
+						struct inode, i_list)) {
+				if (spin_trylock(&inode->i_lock)) {
+					if (can_unuse(inode))
+						goto freeable;
+					spin_unlock(&inode->i_lock);
+				}
 			}
+			continue;
 		}
+freeable:
 		list_move(&inode->i_list, &freeable);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
@@ -695,9 +689,6 @@
 {
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	spin_unlock(&sb_inode_list_lock);
-	spin_lock(&wb_inode_list_lock);
-	list_add(&inode->i_list, &inode_in_use);
-	spin_unlock(&wb_inode_list_lock);
 	if (b) {
 		spin_lock_bucket(b);
 		hlist_bl_add_head(&inode->i_hash, &b->head);
@@ -1371,13 +1362,15 @@
 		drop = generic_drop_inode(inode);
 
 	if (!drop) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
-			spin_lock(&wb_inode_list_lock);
-			list_move(&inode->i_list, &inode_unused);
-			spin_unlock(&wb_inode_list_lock);
-		}
-		atomic_inc(&inodes_stat.nr_unused);
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC)) &&
+					list_empty(&inode->i_list)) {
+				spin_lock(&wb_inode_list_lock);
+				list_add(&inode->i_list, &inode_unused);
+				spin_unlock(&wb_inode_list_lock);
+				atomic_inc(&inodes_stat.nr_unused);
+			}
 			spin_unlock(&inode->i_lock);
 			spin_unlock(&sb_inode_list_lock);
 			return;
@@ -1392,11 +1385,14 @@
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
 		__remove_inode_hash(inode);
-		atomic_dec(&inodes_stat.nr_unused);
 	}
-	spin_lock(&wb_inode_list_lock);
-	list_del_init(&inode->i_list);
-	spin_unlock(&wb_inode_list_lock);
+	if (!list_empty(&inode->i_list)) {
+		spin_lock(&wb_inode_list_lock);
+		list_del_init(&inode->i_list);
+		spin_unlock(&wb_inode_list_lock);
+		if (!inode->i_state)
+			atomic_dec(&inodes_stat.nr_unused);
+	}
 	list_del_init(&inode->i_sb_list);
 	spin_unlock(&sb_inode_list_lock);
 	WARN_ON(inode->i_state & I_NEW);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2010-10-19 14:18:59.000000000 +1100
+++ linux-2.6/include/linux/fs.h	2010-10-19 14:19:28.000000000 +1100
@@ -1637,16 +1637,17 @@
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
-#define I_DIRTY_SYNC		1
-#define I_DIRTY_DATASYNC	2
-#define I_DIRTY_PAGES		4
+#define I_DIRTY_SYNC		0x01
+#define I_DIRTY_DATASYNC	0x02
+#define I_DIRTY_PAGES		0x04
 #define __I_NEW			3
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		16
-#define I_FREEING		32
-#define I_CLEAR			64
+#define I_WILL_FREE		0x10
+#define I_FREEING		0x20
+#define I_CLEAR			0x40
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define I_REFERENCED		0x100
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -2187,7 +2188,6 @@
 extern int insert_inode_locked(struct inode *);
 extern void unlock_new_inode(struct inode *);
 
-extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
 extern void end_writeback(struct inode *);
 extern void destroy_inode(struct inode *);
@@ -2401,6 +2401,12 @@
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
+static inline void __iget(struct inode *inode)
+{
+	assert_spin_locked(&inode->i_lock);
+	inode->i_count++;
+}
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
 	ino_t res;
Index: linux-2.6/fs/fs-writeback.c
===================================================================
--- linux-2.6.orig/fs/fs-writeback.c	2010-10-19 14:18:59.000000000 +1100
+++ linux-2.6/fs/fs-writeback.c	2010-10-19 14:19:25.000000000 +1100
@@ -416,14 +416,9 @@
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (inode->i_count) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean
 			 */
 			list_move(&inode->i_list, &inode_unused);
 		}
Index: linux-2.6/include/linux/writeback.h
===================================================================
--- linux-2.6.orig/include/linux/writeback.h	2010-10-19 14:18:59.000000000 +1100
+++ linux-2.6/include/linux/writeback.h	2010-10-19 14:19:23.000000000 +1100
@@ -11,7 +11,6 @@
 
 extern spinlock_t sb_inode_list_lock;
 extern spinlock_t wb_inode_list_lock;
-extern struct list_head inode_in_use;
 extern struct list_head inode_unused;
 
 /*



  parent reply	other threads:[~2010-10-19  4:01 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-19  3:42 [patch 00/35] my inode scaling series for review npiggin
2010-10-19  3:42 ` [patch 01/35] bit_spinlock: add required includes npiggin
2010-10-19  3:42 ` [patch 02/35] kernel: add bl_list npiggin
2010-10-19  3:42 ` [patch 03/35] mm: implement per-zone shrinker npiggin
2010-10-19  3:42   ` npiggin
2010-10-19  4:49   ` KOSAKI Motohiro
2010-10-19  4:49     ` KOSAKI Motohiro
2010-10-19  5:33     ` Nick Piggin
2010-10-19  5:33       ` Nick Piggin
2010-10-19  5:40       ` KOSAKI Motohiro
2010-10-19  5:40         ` KOSAKI Motohiro
2010-10-19  3:42 ` [patch 04/35] vfs: convert inode and dentry caches to " npiggin
2010-10-19  3:42   ` npiggin
2010-10-19  3:42 ` [patch 05/35] fs: icache lock s_inodes list npiggin
2010-10-19  3:42 ` [patch 06/35] fs: icache lock inode hash npiggin
2010-10-19  3:42 ` [patch 07/35] fs: icache lock i_state npiggin
2010-10-19 10:47   ` Miklos Szeredi
2010-10-19 17:06     ` Peter Zijlstra
2010-10-19  3:42 ` [patch 08/35] fs: icache lock i_count npiggin
2010-10-19 10:16   ` Boaz Harrosh
2010-10-20  2:14     ` Nick Piggin
2010-10-19  3:42 ` [patch 09/35] fs: icache lock lru/writeback lists npiggin
2010-10-19  3:42 ` [patch 10/35] fs: icache atomic inodes_stat npiggin
2010-10-19  3:42 ` [patch 11/35] fs: icache lock inode state npiggin
2010-10-19  3:42 ` [patch 12/35] fs: inode atomic last_ino, iunique lock npiggin
2010-10-19  3:42 ` [patch 13/35] fs: icache remove inode_lock npiggin
2010-10-19  3:42 ` [patch 14/35] fs: icache factor hash lock into functions npiggin
2010-10-19  3:42 ` [patch 15/35] fs: icache per-bucket inode hash locks npiggin
2010-10-19  3:42 ` npiggin [this message]
2010-10-19  3:42 ` [patch 17/35] fs: icache RCU free inodes npiggin
2010-10-19  3:42 ` [patch 18/35] fs: avoid inode RCU freeing for pseudo fs npiggin
2010-10-19  3:42 ` [patch 19/35] fs: icache remove redundant i_sb_list umount locking npiggin
2010-10-20 12:46   ` Al Viro
2010-10-20 13:03     ` Nick Piggin
2010-10-20 13:27       ` Al Viro
2010-10-19  3:42 ` [patch 20/35] fs: icache rcu walk for i_sb_list npiggin
2010-10-19  3:42 ` [patch 21/35] fs: icache per-cpu nr_inodes, non-atomic nr_unused counters npiggin
2010-10-19  3:42 ` [patch 22/35] fs: icache per-cpu last_ino allocator npiggin
2010-10-19  3:42 ` [patch 23/35] fs: icache use per-CPU lists and locks for sb inode lists npiggin
2010-10-19 15:33   ` Miklos Szeredi
2010-10-20  2:37     ` Nick Piggin
2010-10-19  3:42 ` [patch 24/35] fs: icache use RCU to avoid locking in hash lookups npiggin
2010-10-19  3:42 ` [patch 25/35] fs: icache reduce some locking overheads npiggin
2010-10-19  3:42 ` [patch 26/35] fs: icache alloc anonymous inode allocation npiggin
2010-10-19 15:50   ` Miklos Szeredi
2010-10-20  2:38     ` Nick Piggin
2010-10-19 16:33   ` Christoph Hellwig
2010-10-20  3:07     ` Nick Piggin
2010-10-19  3:42 ` [patch 27/35] fs: icache split IO and LRU lists npiggin
2010-10-19 16:12   ` Miklos Szeredi
2010-10-20  2:41     ` Nick Piggin
2010-10-19  3:42 ` [patch 28/35] fs: icache split writeback and lru locks npiggin
2010-10-19  3:42 ` [patch 29/35] fs: icache per-bdi writeback list locking npiggin
2010-10-19  3:42 ` [patch 30/35] fs: icache lazy LRU avoid LRU locking after IO operation npiggin
2010-10-19  3:42 ` [patch 31/35] fs: icache per-zone inode LRU npiggin
2010-10-19 12:38   ` Dave Chinner
2010-10-20  2:35     ` Nick Piggin
2010-10-20  3:12       ` Nick Piggin
2010-10-20  3:12         ` Nick Piggin
2010-10-20  9:43         ` Dave Chinner
2010-10-20  9:43           ` Dave Chinner
2010-10-20 10:02           ` Nick Piggin
2010-10-20 10:02             ` Nick Piggin
2010-10-20  3:14     ` KOSAKI Motohiro
2010-10-20  3:20       ` Nick Piggin
2010-10-20  3:29         ` KOSAKI Motohiro
2010-10-20 10:19         ` Dave Chinner
2010-10-20 10:41           ` Nick Piggin
2010-10-19  3:42 ` [patch 32/35] fs: icache minimise I_FREEING latency npiggin
2010-10-19  3:42 ` [patch 33/35] fs: icache introduce inode_get/inode_get_ilock npiggin
2010-10-19 10:17   ` Boaz Harrosh
2010-10-20  2:17     ` Nick Piggin
2010-10-19  3:42 ` [patch 34/35] fs: inode rename i_count to i_refs npiggin
2010-10-19  3:42 ` [patch 35/35] fs: icache document more lock orders npiggin
2010-10-19 16:22 ` [patch 00/35] my inode scaling series for review Christoph Hellwig
2010-10-20  3:05   ` Nick Piggin
2010-10-20 13:14 ` Al Viro
2010-10-20 13:59   ` Nick Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101019034657.147371115@kernel.dk \
    --to=npiggin@kernel.dk \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.