From: Christian Stroetmann <stroetmann@ontolinux.com>
To: Dave Chinner <david@fromorbit.com>
Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH 04/21] fs: Implement lazy LRU updates for inodes
Date: Thu, 21 Oct 2010 04:14:23 +0200 [thread overview]
Message-ID: <4CBFA1FF.5030109@ontolinux.com> (raw)
In-Reply-To: <1287622186-1935-5-git-send-email-david@fromorbit.com>
Just only 2 typos
On the 21.10.2010 02:49, Dave Chinner wrote:
> From: Nick Piggin<npiggin@suse.de>
>
> Convert the inode LRU to use lazy updates to reduce lock and
> cacheline traffic. We avoid moving inodes around in the LRU list
> during iget/iput operations so these frequent operations don't need
> to access the LRUs. Instead, we defer the refcount checks to
> reclaim-time and use a per-inode state flag, I_REFERENCED, to tell
> reclaim that iget has touched the inode in the past. This means that
> only reclaim should be touching the LRU with any frequency, hence
> significantly reducing lock acquisitions and the amount contention
> on LRU updates.
>
> This also removes the inode_in_use list, which means we now only
> have one list for tracking the inode LRU status. This makes it much
> simpler to split out the LRU list operations under it's own lock.
>
> Signed-off-by: Nick Piggin<npiggin@suse.de>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> Reviewed-by: Christoph Hellwig<hch@lst.de>
> ---
> fs/fs-writeback.c | 14 +++---
> fs/inode.c | 111 +++++++++++++++++++++++++++-----------------
> fs/internal.h | 6 +++
> include/linux/fs.h | 13 +++---
> include/linux/writeback.h | 1 -
> 5 files changed, 88 insertions(+), 57 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 58a95b7..33e9857 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -408,16 +408,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> * completion.
> */
> redirty_tail(inode);
> - } else if (atomic_read(&inode->i_count)) {
> - /*
> - * The inode is clean, inuse
> - */
> - list_move(&inode->i_list,&inode_in_use);
> } else {
> /*
> - * The inode is clean, unused
> + * The inode is clean. If it is unused, then make sure
> + * that it is put on the LRU correctly as iput_final()
> + * does not move dirty inodes to the LRU and dirty
> + * inodes are removed from the LRU during scanning.
> */
> - list_move(&inode->i_list,&inode_unused);
> + list_del_init(&inode->i_list);
> + if (!atomic_read(&inode->i_count))
> + inode_lru_list_add(inode);
> }
> }
> inode_sync_complete(inode);
> diff --git a/fs/inode.c b/fs/inode.c
> index b3b6a4b..f47ec71 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -72,7 +72,6 @@ static unsigned int i_hash_shift __read_mostly;
> * allowing for low-overhead inode sync() operations.
> */
>
> -LIST_HEAD(inode_in_use);
> LIST_HEAD(inode_unused);
> static struct hlist_head *inode_hashtable __read_mostly;
>
> @@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode)
> INIT_HLIST_NODE(&inode->i_hash);
> INIT_LIST_HEAD(&inode->i_dentry);
> INIT_LIST_HEAD(&inode->i_devices);
> + INIT_LIST_HEAD(&inode->i_list);
> INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
> spin_lock_init(&inode->i_data.tree_lock);
> spin_lock_init(&inode->i_data.i_mmap_lock);
> @@ -317,12 +317,21 @@ static void init_once(void *foo)
> */
> void __iget(struct inode *inode)
> {
> - if (atomic_inc_return(&inode->i_count) != 1)
> - return;
> + atomic_inc(&inode->i_count);
> +}
> +
> +void inode_lru_list_add(struct inode *inode)
> +{
> + list_add(&inode->i_list,&inode_unused);
> + percpu_counter_inc(&nr_inodes_unused);
> +}
>
> - if (!(inode->i_state& (I_DIRTY|I_SYNC)))
> - list_move(&inode->i_list,&inode_in_use);
> - percpu_counter_dec(&nr_inodes_unused);
> +void inode_lru_list_del(struct inode *inode)
> +{
> + if (!list_empty(&inode->i_list)) {
> + list_del_init(&inode->i_list);
> + percpu_counter_dec(&nr_inodes_unused);
> + }
> }
>
> void end_writeback(struct inode *inode)
> @@ -367,7 +376,7 @@ static void dispose_list(struct list_head *head)
> struct inode *inode;
>
> inode = list_first_entry(head, struct inode, i_list);
> - list_del(&inode->i_list);
> + list_del_init(&inode->i_list);
>
> evict(inode);
>
> @@ -410,9 +419,9 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
> continue;
> invalidate_inode_buffers(inode);
> if (!atomic_read(&inode->i_count)) {
> - list_move(&inode->i_list, dispose);
> WARN_ON(inode->i_state& I_NEW);
> inode->i_state |= I_FREEING;
> + list_move(&inode->i_list, dispose);
> percpu_counter_dec(&nr_inodes_unused);
> continue;
> }
> @@ -447,31 +456,21 @@ int invalidate_inodes(struct super_block *sb)
> }
> EXPORT_SYMBOL(invalidate_inodes);
>
> -static int can_unuse(struct inode *inode)
> -{
> - if (inode->i_state)
> - return 0;
> - if (inode_has_buffers(inode))
> - return 0;
> - if (atomic_read(&inode->i_count))
> - return 0;
> - if (inode->i_data.nrpages)
> - return 0;
> - return 1;
> -}
> -
> /*
> - * Scan `goal' inodes on the unused list for freeable ones. They are moved to
> - * a temporary list and then are freed outside inode_lock by dispose_list().
> + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
> + * temporary list and then are freed outside inode_lock by dispose_list().
> *
> * Any inodes which are pinned purely because of attached pagecache have their
> - * pagecache removed. We expect the final iput() on that inode to add it to
> - * the front of the inode_unused list. So look for it there and if the
> - * inode is still freeable, proceed. The right inode is found 99.9% of the
> - * time in testing on a 4-way.
> + * pagecache removed. If the inode has metadata buffers attached to
> + * mapping->private_list then try to remove them.
> *
> - * If the inode has metadata buffers attached to mapping->private_list then
> - * try to remove them.
> + * If the inode has the I_REFERENCED flag set, it means that it has been used
, then it means
> + * recently - the flag is set in iput_final(). When we encounter such an inode,
> + * clear the flag and move it to the back of the LRU so it gets another pass
> + * through the LRU before it gets reclaimed. This is necessary because of the
> + * fact we are doing lazy LRU updates to minimise lock contention so the LRU
> + * does not have strict ordering. Hence we don't want to reclaim inodes with
> + * this flag set because they are the inodes that are out of order.
> */
> static void prune_icache(int nr_to_scan)
> {
> @@ -489,8 +488,21 @@ static void prune_icache(int nr_to_scan)
>
> inode = list_entry(inode_unused.prev, struct inode, i_list);
>
> - if (inode->i_state || atomic_read(&inode->i_count)) {
> + /*
> + * Referenced or dirty inodes are still in use. Give them
> + * another pass through the LRU as we canot reclaim them now.
> + */
> + if (atomic_read(&inode->i_count) ||
> + (inode->i_state& ~I_REFERENCED)) {
> + list_del_init(&inode->i_list);
> + percpu_counter_dec(&nr_inodes_unused);
> + continue;
> + }
> +
> + /* recently referenced inodes get one more pass */
> + if (inode->i_state& I_REFERENCED) {
> list_move(&inode->i_list,&inode_unused);
> + inode->i_state&= ~I_REFERENCED;
> continue;
> }
> if (inode_has_buffers(inode) || inode->i_data.nrpages) {
> @@ -500,13 +512,19 @@ static void prune_icache(int nr_to_scan)
> reap += invalidate_mapping_pages(&inode->i_data,
> 0, -1);
> iput(inode);
> - spin_lock(&inode_lock);
>
> - if (inode != list_entry(inode_unused.next,
> - struct inode, i_list))
> - continue; /* wrong inode or list_empty */
> - if (!can_unuse(inode))
> - continue;
> + /*
> + * Rather than try to determine if we can still use the
> + * inode after calling iput(), leave the inode where it
> + * is on the LRU. If we race with another reclaimer,
, then that reclaimer
> + * that reclaimer will either see a reference count
> + * or the I_REFERENCED flag, and move the inode to the
> + * back of the LRU. If we don't race, then we'll see
> + * the I_REFERENCED flag on the next pass and do the
> + * same. Either way, we won't spin on it in this loop.
> + */
> + spin_lock(&inode_lock);
> + continue;
[...]
next prev parent reply other threads:[~2010-10-21 2:16 UTC|newest]
Thread overview: 58+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-10-21 0:49 Inode Lock Scalability V6 Dave Chinner
2010-10-21 0:49 ` [PATCH 01/21] fs: switch bdev inode bdi's correctly Dave Chinner
2010-10-21 0:49 ` [PATCH 02/21] kernel: add bl_list Dave Chinner
2010-10-21 0:49 ` [PATCH 03/21] fs: Convert nr_inodes and nr_unused to per-cpu counters Dave Chinner
2010-10-21 0:49 ` [PATCH 04/21] fs: Implement lazy LRU updates for inodes Dave Chinner
2010-10-21 2:14 ` Christian Stroetmann [this message]
2010-10-21 10:07 ` Nick Piggin
2010-10-21 12:22 ` Christoph Hellwig
2010-10-23 9:32 ` Al Viro
2010-10-21 0:49 ` [PATCH 05/21] fs: inode split IO and LRU lists Dave Chinner
2010-10-21 0:49 ` [PATCH 06/21] fs: Clean up inode reference counting Dave Chinner
2010-10-21 1:41 ` Christoph Hellwig
2010-10-21 0:49 ` [PATCH 07/21] exofs: use iput() for inode reference count decrements Dave Chinner
2010-10-21 0:49 ` [PATCH 08/21] fs: rework icount to be a locked variable Dave Chinner
2010-10-21 19:40 ` Al Viro
2010-10-21 22:32 ` Dave Chinner
2010-10-21 0:49 ` [PATCH 09/21] fs: Factor inode hash operations into functions Dave Chinner
2010-10-21 0:49 ` [PATCH 10/21] fs: Stop abusing find_inode_fast in iunique Dave Chinner
2010-10-21 0:49 ` [PATCH 11/21] fs: move i_ref increments into find_inode/find_inode_fast Dave Chinner
2010-10-21 0:49 ` [PATCH 12/21] fs: remove inode_add_to_list/__inode_add_to_list Dave Chinner
2010-10-21 0:49 ` [PATCH 13/21] fs: Introduce per-bucket inode hash locks Dave Chinner
2010-10-21 0:49 ` [PATCH 14/21] fs: add a per-superblock lock for the inode list Dave Chinner
2010-10-21 0:49 ` [PATCH 15/21] fs: split locking of inode writeback and LRU lists Dave Chinner
2010-10-21 0:49 ` [PATCH 16/21] fs: Protect inode->i_state with the inode->i_lock Dave Chinner
2010-10-22 1:56 ` Al Viro
2010-10-22 2:26 ` Nick Piggin
2010-10-22 3:14 ` Dave Chinner
2010-10-22 10:37 ` Al Viro
2010-10-22 11:40 ` Christoph Hellwig
2010-10-23 21:40 ` Al Viro
2010-10-23 21:37 ` Al Viro
2010-10-24 14:13 ` Christoph Hellwig
2010-10-24 16:21 ` Christoph Hellwig
2010-10-24 19:17 ` Al Viro
2010-10-24 20:04 ` Christoph Hellwig
2010-10-24 20:36 ` Al Viro
2010-10-24 2:18 ` Nick Piggin
2010-10-21 0:49 ` [PATCH 17/21] fs: protect wake_up_inode with inode->i_lock Dave Chinner
2010-10-21 2:17 ` Christoph Hellwig
2010-10-21 13:16 ` Nick Piggin
2010-10-21 0:49 ` [PATCH 18/21] fs: introduce a per-cpu last_ino allocator Dave Chinner
2010-10-21 0:49 ` [PATCH 19/21] fs: icache remove inode_lock Dave Chinner
2010-10-21 2:14 ` Christian Stroetmann
2010-10-21 0:49 ` [PATCH 20/21] fs: Reduce inode I_FREEING and factor inode disposal Dave Chinner
2010-10-21 0:49 ` [PATCH 21/21] fs: do not assign default i_ino in new_inode Dave Chinner
2010-10-21 5:04 ` Inode Lock Scalability V7 (was V6) Dave Chinner
2010-10-21 13:20 ` Nick Piggin
2010-10-21 23:52 ` Dave Chinner
2010-10-22 0:45 ` Nick Piggin
2010-10-22 2:20 ` Al Viro
2010-10-22 2:34 ` Nick Piggin
2010-10-22 2:41 ` Nick Piggin
2010-10-22 2:48 ` Nick Piggin
2010-10-22 3:12 ` Al Viro
2010-10-22 4:48 ` Nick Piggin
2010-10-22 3:07 ` Al Viro
2010-10-22 4:46 ` Nick Piggin
2010-10-22 5:01 ` Nick Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4CBFA1FF.5030109@ontolinux.com \
--to=stroetmann@ontolinux.com \
--cc=david@fromorbit.com \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).