From: Dave Chinner <david@fromorbit.com>
To: xfs@oss.sgi.com
Subject: [PATCH 05/18] xfs: convert inode cache lookups to use RCU locking
Date: Tue, 14 Sep 2010 20:56:04 +1000 [thread overview]
Message-ID: <1284461777-1496-6-git-send-email-david@fromorbit.com> (raw)
In-Reply-To: <1284461777-1496-1-git-send-email-david@fromorbit.com>
From: Dave Chinner <dchinner@redhat.com>
With delayed logging greatly increasing the sustained parallelism of inode
operations, the inode cache locking is showing significant read vs write
contention when inode reclaim runs at the same time as lookups. There are
also a lot more write lock acquisitions than read lock acquisitions (4:1 ratio)
so the read locking is not really buying us much in the way of parallelism.
To avoid the read vs write contention, change the cache to use RCU locking on
the read side. To avoid needing to RCU free every single inode, use the built
in slab RCU freeing mechanism. This requires us to be able to detect lookups of
freed inodes, so ensure that every freed inode has an inode number of zero and
the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache
hit lookup path, but now also add a check for a zero inode number.
We can then convert all the read locking lookups to use RCU read side locking
and hence remove all read side locking.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/kmem.h | 1 +
fs/xfs/linux-2.6/xfs_super.c | 3 ++-
fs/xfs/linux-2.6/xfs_sync.c | 12 ++++++------
fs/xfs/quota/xfs_qm_syscalls.c | 4 ++--
fs/xfs/xfs_iget.c | 36 +++++++++++++++++++++++++++++-------
fs/xfs/xfs_inode.c | 22 ++++++++++++++--------
6 files changed, 54 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index f7c8f7a..c0fe7ef 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -82,6 +82,7 @@ extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_RCU SLAB_DESTROY_BY_RCU
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e0797..6205eb8 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1723,7 +1723,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+ KM_ZONE_RCU | KM_ZONE_HWALIGN |
+ KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index bc54cd6..e549d67 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -102,13 +102,13 @@ restart:
if (exclusive)
write_lock(&pag->pag_ici_lock);
else
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
if (!ip) {
if (exclusive)
write_unlock(&pag->pag_ici_lock);
else
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
break;
}
@@ -204,11 +204,10 @@ xfs_inode_ag_iterator(
return XFS_ERROR(last_error);
}
-/* must be called with pag_ici_lock held and releases it */
int
xfs_sync_inode_valid(
struct xfs_inode *ip,
- struct xfs_perag *pag)
+ struct xfs_perag *pag) __releases(RCU)
{
struct inode *inode = VFS_I(ip);
int error = EFSCORRUPTED;
@@ -219,7 +218,8 @@ xfs_sync_inode_valid(
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
error = ENOENT;
- if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ if (ip->i_ino == 0 ||
+ xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock;
/* If we can't grab the inode, it must on it's way to reclaim. */
@@ -234,7 +234,7 @@ xfs_sync_inode_valid(
/* inode is valid */
error = 0;
out_unlock:
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
return error;
}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849..ab9cafc 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -873,7 +873,7 @@ STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
- int flags)
+ int flags) __releases(RCU)
{
int error;
@@ -882,7 +882,7 @@ xfs_dqrele_inode(
ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
return 0;
}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f..f3a46b6 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,6 +69,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
+ ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -134,6 +135,13 @@ xfs_inode_free(
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
+ /*
+ * because we use SLAB_DESTROY_BY_RCU freeing, ensure the inode
+ * always appears to be reclaimed with an invalid inode number
+ * when in the free state.
+ */
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
kmem_zone_free(xfs_inode_zone, ip);
}
@@ -145,12 +153,26 @@ xfs_iget_cache_hit(
struct xfs_perag *pag,
struct xfs_inode *ip,
int flags,
- int lock_flags) __releases(pag->pag_ici_lock)
+ int lock_flags) __releases(RCU)
{
struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
int error;
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ */
+ if (ip->i_ino == 0) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ rcu_read_unlock();
+ /* Expire the grace period so we don't trip over it again. */
+ synchronize_rcu();
+ return EAGAIN;
+ }
+
spin_lock(&ip->i_flags_lock);
/*
@@ -194,7 +216,7 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
error = -inode_init_always(mp->m_super, inode);
if (error) {
@@ -202,7 +224,7 @@ xfs_iget_cache_hit(
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~XFS_INEW;
@@ -230,7 +252,7 @@ xfs_iget_cache_hit(
/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
trace_xfs_iget_hit(ip);
}
@@ -244,7 +266,7 @@ xfs_iget_cache_hit(
out_error:
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
return error;
}
@@ -375,7 +397,7 @@ xfs_iget(
again:
error = 0;
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (ip) {
@@ -383,7 +405,7 @@ again:
if (error)
goto out_error_or_again;
} else {
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
XFS_STATS_INC(xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f3..6927699 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1999,13 +1999,14 @@ xfs_ifree_cluster(
*/
for (i = 0; i < ninodes; i++) {
retry:
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory or stale, nothing to do */
- if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
- read_unlock(&pag->pag_ici_lock);
+ if (!ip || !ip->i_ino ||
+ xfs_iflags_test(ip, XFS_ISTALE)) {
+ rcu_read_unlock();
continue;
}
@@ -2018,11 +2019,11 @@ retry:
*/
if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
delay(1);
goto retry;
}
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
@@ -2628,7 +2629,7 @@ xfs_iflush_cluster(
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
first_index, inodes_per_cluster);
@@ -2639,6 +2640,11 @@ xfs_iflush_cluster(
iq = ilist[i];
if (iq == ip)
continue;
+
+ /* check we've got a valid inode */
+ if (!iq->i_ino)
+ continue;
+
/* if the inode lies outside this cluster, we're done. */
if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
break;
@@ -2691,7 +2697,7 @@ xfs_iflush_cluster(
}
out_free:
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
kmem_free(ilist);
out_put:
xfs_perag_put(pag);
@@ -2703,7 +2709,7 @@ cluster_corrupt_out:
* Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
/*
* Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
next prev parent reply other threads:[~2010-09-14 10:56 UTC|newest]
Thread overview: 67+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-09-14 10:55 [PATCH 0/18] xfs: metadata and buffer cache scalability improvements Dave Chinner
2010-09-14 10:56 ` [PATCH 01/18] xfs: single thread inode cache shrinking Dave Chinner
2010-09-14 18:48 ` Alex Elder
2010-09-14 22:48 ` Dave Chinner
2010-09-14 10:56 ` [PATCH 02/18] xfs: reduce the number of CIL lock round trips during commit Dave Chinner
2010-09-14 14:48 ` Christoph Hellwig
2010-09-14 17:21 ` Alex Elder
2010-09-14 10:56 ` [PATCH 03/18] xfs: remove debug assert for per-ag reference counting Dave Chinner
2010-09-14 14:48 ` Christoph Hellwig
2010-09-14 17:22 ` Alex Elder
2010-09-14 10:56 ` [PATCH 04/18] xfs: lockless per-ag lookups Dave Chinner
2010-09-14 12:35 ` Dave Chinner
2010-09-14 14:50 ` Christoph Hellwig
2010-09-14 17:28 ` Alex Elder
2010-09-14 10:56 ` Dave Chinner [this message]
2010-09-14 16:27 ` [PATCH 05/18] xfs: convert inode cache lookups to use RCU locking Christoph Hellwig
2010-09-14 23:17 ` Dave Chinner
2010-09-14 21:23 ` Alex Elder
2010-09-14 23:42 ` Dave Chinner
2010-09-14 10:56 ` [PATCH 06/18] xfs: convert pag_ici_lock to a spin lock Dave Chinner
2010-09-14 21:26 ` Alex Elder
2010-09-14 10:56 ` [PATCH 07/18] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
2010-09-14 14:54 ` Christoph Hellwig
2010-09-15 0:14 ` Dave Chinner
2010-09-15 0:17 ` Christoph Hellwig
2010-09-14 22:12 ` Alex Elder
2010-09-15 0:28 ` Dave Chinner
2010-11-08 10:47 ` Christoph Hellwig
2010-09-14 10:56 ` [PATCH 08/18] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
2010-09-14 14:56 ` Christoph Hellwig
2010-09-14 22:14 ` Alex Elder
2010-09-14 10:56 ` [PATCH 09/18] xfs: introduced uncached buffer read primitve Dave Chinner
2010-09-14 14:56 ` Christoph Hellwig
2010-09-14 22:16 ` Alex Elder
2010-09-14 10:56 ` [PATCH 10/18] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Dave Chinner
2010-09-14 14:57 ` Christoph Hellwig
2010-09-14 22:21 ` Alex Elder
2010-09-14 10:56 ` [PATCH 11/18] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
2010-09-14 14:59 ` Christoph Hellwig
2010-09-14 22:26 ` Alex Elder
2010-09-14 10:56 ` [PATCH 12/18] xfs: use unhashed buffers for size checks Dave Chinner
2010-09-14 15:00 ` Christoph Hellwig
2010-09-14 22:29 ` Alex Elder
2010-09-14 10:56 ` [PATCH 13/18] xfs: remove buftarg hash for external devices Dave Chinner
2010-09-14 22:29 ` Alex Elder
2010-09-14 10:56 ` [PATCH 14/18] xfs: convert buffer cache hash to rbtree Dave Chinner
2010-09-14 16:29 ` Christoph Hellwig
2010-09-15 17:46 ` Alex Elder
2010-09-14 10:56 ` [PATCH 15/18] xfs; pack xfs_buf structure more tightly Dave Chinner
2010-09-14 16:30 ` Christoph Hellwig
2010-09-15 18:01 ` Alex Elder
2010-09-14 10:56 ` [PATCH 16/18] xfs: convert xfsbud shrinker to a per-buftarg shrinker Dave Chinner
2010-09-14 16:32 ` Christoph Hellwig
2010-09-15 20:19 ` Alex Elder
2010-09-16 0:28 ` Dave Chinner
2010-09-14 10:56 ` [PATCH 17/18] xfs: add a lru to the XFS buffer cache Dave Chinner
2010-09-14 23:16 ` Christoph Hellwig
2010-09-15 0:05 ` Dave Chinner
2010-09-15 21:28 ` Alex Elder
2010-09-14 10:56 ` [PATCH 18/18] xfs: stop using the page cache to back the " Dave Chinner
2010-09-14 23:20 ` Christoph Hellwig
2010-09-15 0:06 ` Dave Chinner
2010-09-14 14:25 ` [PATCH 0/18] xfs: metadata and buffer cache scalability improvements Christoph Hellwig
2010-09-17 13:21 ` Alex Elder
2010-09-21 2:02 ` Dave Chinner
2010-09-21 16:23 ` Alex Elder
2010-09-21 22:34 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1284461777-1496-6-git-send-email-david@fromorbit.com \
--to=david@fromorbit.com \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox