From: npiggin@kernel.dk
To: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
npiggin@kernel.dk
Subject: [patch 07/14] fs: icache lock lru/writeback lists
Date: Fri, 22 Oct 2010 00:08:36 +1100 [thread overview]
Message-ID: <20101021131016.640688024@kernel.dk> (raw)
In-Reply-To: 20101021130829.442910807@kernel.dk
[-- Attachment #1: fs-inode_lock-scale-6.patch --]
[-- Type: text/plain, Size: 14650 bytes --]
Add a new lock, wb_inode_list_lock, to protect i_list and the lists that an
inode can be put onto via i_list: inode_in_use, inode_unused, and the
writeback lists b_dirty, b_io and b_more_io.
[Note: it should be possible to lift inode_lock a bit further off most of the
IO list walks, but perhaps not off the LRU walks yet.]
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
fs/fs-writeback.c | 54 ++++++++++++++++++++++++++++++++++++---
fs/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++----
fs/internal.h | 1
include/linux/writeback.h | 1
mm/backing-dev.c | 4 ++
5 files changed, 114 insertions(+), 9 deletions(-)
Index: linux-2.6/fs/fs-writeback.c
===================================================================
--- linux-2.6.orig/fs/fs-writeback.c 2010-10-21 23:50:27.000000000 +1100
+++ linux-2.6/fs/fs-writeback.c 2010-10-21 23:50:44.000000000 +1100
@@ -169,6 +169,7 @@ static void redirty_tail(struct inode *i
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ assert_spin_locked(&wb_inode_list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -186,6 +187,7 @@ static void requeue_io(struct inode *ino
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ assert_spin_locked(&wb_inode_list_lock);
list_move(&inode->i_list, &wb->b_more_io);
}
@@ -226,6 +228,7 @@ static void move_expired_inodes(struct l
struct inode *inode;
int do_sb_sort = 0;
+ assert_spin_locked(&wb_inode_list_lock);
while (!list_empty(delaying_queue)) {
inode = list_entry(delaying_queue->prev, struct inode, i_list);
if (older_than_this &&
@@ -289,11 +292,13 @@ static void inode_wait_for_writeback(str
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
+ spin_lock(&wb_inode_list_lock);
}
}
@@ -347,6 +352,7 @@ writeback_single_inode(struct inode *ino
/* Set I_SYNC, reset I_DIRTY_PAGES */
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
@@ -383,6 +389,7 @@ writeback_single_inode(struct inode *ino
spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
+ spin_lock(&wb_inode_list_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -469,11 +476,19 @@ static bool pin_sb_for_writeback(struct
static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
struct writeback_control *wbc, bool only_this_sb)
{
+lock_again:
while (!list_empty(&wb->b_io)) {
long pages_skipped;
struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
+ if (!spin_trylock(&inode->i_lock)) {
+ spin_unlock(&wb_inode_list_lock);
+ cpu_relax();
+ spin_lock(&wb_inode_list_lock);
+ goto lock_again;
+ }
+
if (inode->i_sb != sb) {
if (only_this_sb) {
/*
@@ -482,9 +497,12 @@ static int writeback_sb_inodes(struct su
* to it back onto the dirty list.
*/
redirty_tail(inode);
+ spin_unlock(&inode->i_lock);
continue;
}
+ spin_unlock(&inode->i_lock);
+
/*
* The inode belongs to a different superblock.
* Bounce back to the caller to unpin this and
@@ -493,10 +511,9 @@ static int writeback_sb_inodes(struct su
return 0;
}
- spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
requeue_io(inode);
+ spin_unlock(&inode->i_lock);
continue;
}
/*
@@ -509,7 +526,7 @@ static int writeback_sb_inodes(struct su
}
BUG_ON(inode->i_state & I_FREEING);
- inode_get_ilock(inode);
+ inode_get_ilock_wblock(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
if (wbc->pages_skipped != pages_skipped) {
@@ -519,11 +536,13 @@ static int writeback_sb_inodes(struct su
*/
redirty_tail(inode);
}
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
iput(inode);
cond_resched();
spin_lock(&inode_lock);
+ spin_lock(&wb_inode_list_lock);
if (wbc->nr_to_write <= 0) {
wbc->more_io = 1;
return 1;
@@ -543,6 +562,9 @@ void writeback_inodes_wb(struct bdi_writ
if (!wbc->wb_start)
wbc->wb_start = jiffies; /* livelock avoidance */
spin_lock(&inode_lock);
+lock_again:
+ spin_lock(&wb_inode_list_lock);
+
if (!wbc->for_kupdate || list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
@@ -552,7 +574,13 @@ void writeback_inodes_wb(struct bdi_writ
struct super_block *sb = inode->i_sb;
if (!pin_sb_for_writeback(sb)) {
+ if (!spin_trylock(&inode->i_lock)) {
+ spin_unlock(&wb_inode_list_lock);
+ cpu_relax();
+ goto lock_again;
+ }
requeue_io(inode);
+ spin_unlock(&inode->i_lock);
continue;
}
ret = writeback_sb_inodes(sb, wb, wbc, false);
@@ -561,6 +589,7 @@ void writeback_inodes_wb(struct bdi_writ
if (ret)
break;
}
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
/* Leave any unwritten inodes on b_io */
}
@@ -571,9 +600,11 @@ static void __writeback_inodes_sb(struct
WARN_ON(!rwsem_is_locked(&sb->s_umount));
spin_lock(&inode_lock);
+ spin_lock(&wb_inode_list_lock);
if (!wbc->for_kupdate || list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
writeback_sb_inodes(sb, wb, wbc, true);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
}
@@ -685,12 +716,21 @@ static long wb_writeback(struct bdi_writ
* we'll just busyloop.
*/
spin_lock(&inode_lock);
+lock_again:
+ spin_lock(&wb_inode_list_lock);
if (!list_empty(&wb->b_more_io)) {
inode = list_entry(wb->b_more_io.prev,
struct inode, i_list);
+ if (!spin_trylock(&inode->i_lock)) {
+ spin_unlock(&wb_inode_list_lock);
+ cpu_relax();
+ goto lock_again;
+ }
trace_wbc_writeback_wait(&wbc, wb->bdi);
inode_wait_for_writeback(inode);
+ spin_unlock(&inode->i_lock);
}
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
}
@@ -1002,7 +1042,9 @@ void __mark_inode_dirty(struct inode *in
}
inode->dirtied_when = jiffies;
+ spin_lock(&wb_inode_list_lock);
list_move(&inode->i_list, &bdi->wb.b_dirty);
+ spin_unlock(&wb_inode_list_lock);
}
}
out:
@@ -1069,7 +1111,7 @@ static void wait_sb_inodes(struct super_
spin_unlock(&inode->i_lock);
continue;
}
- inode_get_ilock(inode);
+ inode_get_ilock_wblock(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&sb_inode_list_lock);
spin_unlock(&inode_lock);
@@ -1198,7 +1240,9 @@ int write_inode_now(struct inode *inode,
might_sleep();
spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
+ spin_lock(&wb_inode_list_lock);
ret = writeback_single_inode(inode, &wbc);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
if (sync)
@@ -1224,7 +1268,9 @@ int sync_inode(struct inode *inode, stru
spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
+ spin_lock(&wb_inode_list_lock);
ret = writeback_single_inode(inode, wbc);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
return ret;
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c 2010-10-21 23:50:27.000000000 +1100
+++ linux-2.6/fs/inode.c 2010-10-21 23:50:44.000000000 +1100
@@ -41,12 +41,16 @@
* s_inodes, i_sb_list
* inode_hash_lock protects:
* inode hash table, i_hash
+ * wb_inode_list_lock protects:
+ * inode_in_use, inode_unused, b_io, b_more_io, b_dirty, i_list
*
* Ordering:
* inode_lock
* i_lock
* sb_inode_list_lock
+ * wb_inode_list_lock
* inode_hash_lock
+ * wb_inode_list_lock
*/
/*
* This is needed for the following functions:
@@ -107,6 +111,7 @@ static struct hlist_head *inode_hashtabl
*/
DEFINE_SPINLOCK(inode_lock);
DEFINE_SPINLOCK(sb_inode_list_lock);
+DEFINE_SPINLOCK(wb_inode_list_lock);
static DEFINE_SPINLOCK(inode_hash_lock);
/*
@@ -319,6 +324,26 @@ void __inode_get(struct inode *inode)
EXPORT_SYMBOL(__inode_get);
/*
+ * Don't fret, this is going away when inode_get callers and implementations
+ * get much simpler with lazy inode LRU.
+ */
+void inode_get_ilock_wblock(struct inode *inode)
+{
+ assert_spin_locked(&inode_lock);
+ assert_spin_locked(&inode->i_lock);
+ assert_spin_locked(&wb_inode_list_lock);
+ BUG_ON(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE));
+ inode->i_count++;
+ if (inode->i_count != 1)
+ return;
+
+ if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+ list_move(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/*
* inode_lock must be held
*/
void inode_get_ilock(struct inode *inode)
@@ -330,8 +355,11 @@ void inode_get_ilock(struct inode *inode
if (inode->i_count != 1)
return;
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+ if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+ spin_lock(&wb_inode_list_lock);
list_move(&inode->i_list, &inode_in_use);
+ spin_unlock(&wb_inode_list_lock);
+ }
inodes_stat.nr_unused--;
}
EXPORT_SYMBOL(inode_get_ilock);
@@ -387,6 +415,7 @@ static void dispose_list(struct list_hea
while (!list_empty(head)) {
struct inode *inode;
+ /* No locking here, it's a private list now */
inode = list_first_entry(head, struct inode, i_list);
list_del(&inode->i_list);
@@ -442,7 +471,9 @@ static int invalidate_list(struct super_
}
invalidate_inode_buffers(inode);
if (!inode->i_count) {
+ spin_lock(&wb_inode_list_lock);
list_move(&inode->i_list, dispose);
+ spin_unlock(&wb_inode_list_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
count++;
@@ -519,6 +550,8 @@ static void prune_icache(int nr_to_scan)
down_read(&iprune_sem);
spin_lock(&inode_lock);
+lock_again:
+ spin_lock(&wb_inode_list_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
@@ -527,14 +560,20 @@ static void prune_icache(int nr_to_scan)
inode = list_entry(inode_unused.prev, struct inode, i_list);
- spin_lock(&inode->i_lock);
+ if (!spin_trylock(&inode->i_lock)) {
+ spin_unlock(&wb_inode_list_lock);
+ cpu_relax();
+ goto lock_again;
+ }
+
if (inode->i_state || inode->i_count) {
list_move(&inode->i_list, &inode_unused);
spin_unlock(&inode->i_lock);
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
- inode_get_ilock(inode);
+ inode_get_ilock_wblock(inode);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
if (remove_inode_buffers(inode))
@@ -542,7 +581,13 @@ static void prune_icache(int nr_to_scan)
0, -1);
iput(inode);
spin_lock(&inode_lock);
- spin_lock(&inode->i_lock);
+lock_again_2:
+ spin_lock(&wb_inode_list_lock);
+ if (!spin_trylock(&inode->i_lock)) {
+ spin_unlock(&wb_inode_list_lock);
+ cpu_relax();
+ goto lock_again_2;
+ }
if (inode != list_entry(inode_unused.next,
struct inode, i_list)) {
@@ -565,6 +610,7 @@ static void prune_icache(int nr_to_scan)
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
dispose_list(&freeable);
@@ -682,7 +728,9 @@ __inode_add_to_lists(struct super_block
struct inode *inode)
{
inodes_stat.nr_inodes++;
+ spin_lock(&wb_inode_list_lock);
list_add(&inode->i_list, &inode_in_use);
+ spin_unlock(&wb_inode_list_lock);
spin_lock(&sb_inode_list_lock);
list_add(&inode->i_sb_list, &sb->s_inodes);
spin_unlock(&sb_inode_list_lock);
@@ -1376,8 +1424,11 @@ static void iput_final(struct inode *ino
drop = generic_drop_inode(inode);
if (!drop) {
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+ if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+ spin_lock(&wb_inode_list_lock);
list_move(&inode->i_list, &inode_unused);
+ spin_unlock(&wb_inode_list_lock);
+ }
inodes_stat.nr_unused++;
if (sb->s_flags & MS_ACTIVE) {
spin_unlock(&inode->i_lock);
@@ -1398,7 +1449,9 @@ static void iput_final(struct inode *ino
hlist_del_init(&inode->i_hash);
spin_unlock(&inode_hash_lock);
}
+ spin_lock(&wb_inode_list_lock);
list_del_init(&inode->i_list);
+ spin_unlock(&wb_inode_list_lock);
spin_lock(&sb_inode_list_lock);
list_del_init(&inode->i_sb_list);
spin_unlock(&sb_inode_list_lock);
Index: linux-2.6/include/linux/writeback.h
===================================================================
--- linux-2.6.orig/include/linux/writeback.h 2010-10-21 23:50:27.000000000 +1100
+++ linux-2.6/include/linux/writeback.h 2010-10-21 23:50:42.000000000 +1100
@@ -11,6 +11,7 @@ struct backing_dev_info;
extern spinlock_t inode_lock;
extern spinlock_t sb_inode_list_lock;
+extern spinlock_t wb_inode_list_lock;
extern struct list_head inode_in_use;
extern struct list_head inode_unused;
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c 2010-10-21 23:49:53.000000000 +1100
+++ linux-2.6/mm/backing-dev.c 2010-10-21 23:50:43.000000000 +1100
@@ -74,12 +74,14 @@ static int bdi_debug_stats_show(struct s
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
+ spin_lock(&wb_inode_list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_list)
nr_more_io++;
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -683,9 +685,11 @@ void bdi_destroy(struct backing_dev_info
struct bdi_writeback *dst = &default_backing_dev_info.wb;
spin_lock(&inode_lock);
+ spin_lock(&wb_inode_list_lock);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+ spin_unlock(&wb_inode_list_lock);
spin_unlock(&inode_lock);
}
Index: linux-2.6/fs/internal.h
===================================================================
--- linux-2.6.orig/fs/internal.h 2010-10-21 23:49:57.000000000 +1100
+++ linux-2.6/fs/internal.h 2010-10-21 23:50:41.000000000 +1100
@@ -74,6 +74,7 @@ extern void __init mnt_init(void);
DECLARE_BRLOCK(vfsmount_lock);
+extern void inode_get_ilock_wblock(struct inode *inode);
/*
* fs_struct.c
next prev parent reply other threads:[~2010-10-21 13:08 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-10-21 13:08 [patch 00/14] reworked minimal inode_lock breaking series npiggin
2010-10-21 13:08 ` [patch 01/14] fs: icache begin inode_lock lock breaking npiggin
2010-10-21 13:08 ` [patch 02/14] fs: icache lock i_count npiggin
2010-10-21 13:08 ` [patch 03/14] fs: icache lock inodes icache state npiggin
2010-10-21 13:08 ` [patch 04/14] fs: icache unmount code cleanup npiggin
2010-10-21 13:08 ` [patch 05/14] fs: icache lock s_inodes list npiggin
2010-10-21 13:08 ` [patch 06/14] fs: icache lock inode hash npiggin
2010-10-21 13:08 ` npiggin [this message]
2010-10-21 13:08 ` [patch 08/14] fs: icache make nr_inodes and nr_unused atomic npiggin
2010-10-21 13:08 ` [patch 09/14] fs: inode atomic last_ino, iunique lock npiggin
2010-10-21 13:08 ` [patch 10/14] fs: icache remove inode_lock npiggin
2010-10-21 13:08 ` [patch 11/14] fs: icache factor hash lock into functions npiggin
2010-10-21 13:08 ` [patch 12/14] fs: icache lazy inode lru npiggin
2010-10-21 13:08 ` [patch 13/14] fs: icache split IO and LRU lists npiggin
2010-10-21 15:28 ` Christoph Lameter
2010-10-22 0:00 ` Nick Piggin
2010-10-22 1:05 ` Nick Piggin
2010-10-21 13:08 ` [patch 14/14] fs: icache split writeback and lru locks npiggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20101021131016.640688024@kernel.dk \
--to=npiggin@kernel.dk \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).