* [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-04 14:15 ` Christoph Hellwig
0 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-04 14:15 UTC (permalink / raw)
To: xfs; +Cc: linux-fsdevel, david
[-- Attachment #1: xfs-fix-xfs_iget_cache_hit-locking --]
[-- Type: text/plain, Size: 8123 bytes --]
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.775080254 +0200
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.807080483 +0200
@@ -133,80 +133,90 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * This inode is being torn down, pause and try again.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & XFS_IRECLAIM) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+ /*
+ * If we are racing with another cache hit that is currently recycling
+ * this inode out of the XFS_IRECLAIMABLE state, wait for the
+ * initialisation to complete before continuing.
+ */
+ if (ip->i_flags & XFS_INEW) {
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ XFS_STATS_INC(xs_ig_frecycle);
+ wait_on_inode(inode);
+ return EAGAIN;
+ }
+ /*
+ * If lookup is racing with unlink, then we should return an
+ * error immediately so we don't remove it from the reclaim
+ * list and potentially leak the inode.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(pag, ip);
+
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
+
+ if (unlikely(!inode_init_always(mp->m_super, inode))) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+
error = ENOMEM;
goto out_error;
}
-
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -216,6 +226,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01 23:20:31.441330970 +0200
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01 23:20:54.807080483 +0200
@@ -708,6 +708,17 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ ip->i_flags |= XFS_IRECLAIMABLE;
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,9 +733,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ __xfs_inode_set_reclaim_tag(pag, ip);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
@@ -732,27 +741,13 @@ xfs_inode_set_reclaim_tag(
void
__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
- xfs_perag_t *pag,
- xfs_inode_t *ip)
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
{
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-}
-
-void
-xfs_inode_clear_reclaim_tag(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
-
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
}
STATIC int
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01 23:20:31.449329683 +0200
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01 23:20:54.808079772 +0200
@@ -48,9 +48,8 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
- struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-04 14:15 ` Christoph Hellwig
(?)
@ 2009-08-06 21:50 ` Eric Sandeen
2009-08-06 22:29 ` Christoph Hellwig
-1 siblings, 1 reply; 38+ messages in thread
From: Eric Sandeen @ 2009-08-06 21:50 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
Any chance this could be broken into 2 patches starting
with the set/clear cleanup something like:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b..49c7b6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,17 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ ip->i_flags |= XFS_IRECLAIMABLE;
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,9 +733,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ __xfs_inode_set_reclaim_tag(pag, ip);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
@@ -732,27 +741,12 @@ xfs_inode_set_reclaim_tag(
void
__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
- xfs_perag_t *pag,
- xfs_inode_t *ip)
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
{
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-}
-
-void
-xfs_inode_clear_reclaim_tag(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
-
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301..cb64d20 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,9 +48,8 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
- struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 5fcec6f..94b72d3 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -183,7 +183,7 @@ xfs_iget_cache_hit(
xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
/* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ __xfs_inode_clear_reclaim_tag(pag, ip);
} else if (!igrab(VFS_I(ip))) {
/* If the VFS inode is being torn down, pause and try again. */
XFS_STATS_INC(xs_ig_frecycle);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-06 21:50 ` Eric Sandeen
@ 2009-08-06 22:29 ` Christoph Hellwig
0 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-06 22:29 UTC (permalink / raw)
To: Eric Sandeen; +Cc: xfs
On Thu, Aug 06, 2009 at 04:50:23PM -0500, Eric Sandeen wrote:
> Any chance this could be broken into 2 patches starting
> with the set/clear cleanup something like:
Let's just drop those for now given that we're late enough in the cycle.
New version below:
--
Subject: xfs: fix locking in xfs_iget_cache_hit
From: Christoph Hellwig <hch@lst.de>
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-06 19:25:05.522592017 -0300
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-06 19:25:31.760342654 -0300
@@ -133,80 +133,92 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * This inode is being torn down, pause and try again.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & XFS_IRECLAIM) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+ /*
+ * If we are racing with another cache hit that is currently recycling
+ * this inode out of the XFS_IRECLAIMABLE state, wait for the
+ * initialisation to complete before continuing.
+ */
+ if (ip->i_flags & XFS_INEW) {
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ XFS_STATS_INC(xs_ig_frecycle);
+ wait_on_inode(inode);
+ return EAGAIN;
+ }
+ /*
+ * If lookup is racing with unlink, then we should return an
+ * error immediately so we don't remove it from the reclaim
+ * list and potentially leak the inode.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
+
+ if (unlikely(!inode_init_always(mp->m_super, inode))) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+
error = ENOMEM;
goto out_error;
}
-
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -216,6 +228,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-06 19:25:05.530591777 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-06 19:25:43.727344574 -0300
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-06 19:25:05.587593862 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-06 19:25:31.764365652 -0300
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-04 14:15 ` Christoph Hellwig
@ 2009-08-07 17:25 ` Felix Blyakher
-1 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-07 17:25 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
On Aug 4, 2009, at 9:15 AM, Christoph Hellwig wrote:
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
Some comments below as well as couple of questions based on the
difference with previous code, not necessarily pointing to a
problem. Just trying to figure it out for myself.
>
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.775080254
> +0200
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.807080483 +0200
> @@ -133,80 +133,90 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * This inode is being torn down, pause and try again.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & XFS_IRECLAIM) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> + /*
> + * If we are racing with another cache hit that is currently
> recycling
> + * this inode out of the XFS_IRECLAIMABLE state, wait for the
> + * initialisation to complete before continuing.
> + */
> + if (ip->i_flags & XFS_INEW) {
Another case when we find XFS_INEW set is the race with the
cache miss, which just set up a new inode. Would the proposed
code be still sensible in that case? If yes, at least comments
should be updated.
>
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
>
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + XFS_STATS_INC(xs_ig_frecycle);
> + wait_on_inode(inode);
It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
Then the wait_on_inode() would return quickly even before the
linux inode is reinitialized. Though, that was the case with
the old code as well.
>
> + return EAGAIN;
This return seems inconsistent with usual goto end of function
convention. I understand that in this case goto out_error would
be incorrect. Should we create new label out_error_unlocked?
>
> + }
>
> + /*
> + * If lookup is racing with unlink, then we should return an
> + * error immediately so we don't remove it from the reclaim
> + * list and potentially leak the inode.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
Previously the conclusion of the race with unlink was based on
XFS_IRECLAIMABLE i_flag set in addition to the test above.
Is is no longer a case, or not really necessary?
>
> + error = ENOENT;
> + goto out_error;
> + }
> +
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (!inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + __xfs_inode_clear_reclaim_tag(pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(!inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -216,6 +226,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01
> 23:20:31.441330970 +0200
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01
> 23:20:54.807080483 +0200
> @@ -708,6 +708,17 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> + ip->i_flags |= XFS_IRECLAIMABLE;
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,9 +733,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> - __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> xfs_put_perag(mp, pag);
> @@ -732,27 +741,13 @@ xfs_inode_set_reclaim_tag(
>
> void
> __xfs_inode_clear_reclaim_tag(
> - xfs_mount_t *mp,
> - xfs_perag_t *pag,
> - xfs_inode_t *ip)
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> {
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> radix_tree_tag_clear(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> -}
> -
> -void
> -xfs_inode_clear_reclaim_tag(
> - xfs_inode_t *ip)
> -{
> - xfs_mount_t *mp = ip->i_mount;
> - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
> -
> - read_lock(&pag->pag_ici_lock);
> - spin_lock(&ip->i_flags_lock);
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - spin_unlock(&ip->i_flags_lock);
> - read_unlock(&pag->pag_ici_lock);
> - xfs_put_perag(mp, pag);
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> }
>
> STATIC int
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01
> 23:20:31.449329683 +0200
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01
> 23:20:54.808079772 +0200
> @@ -48,9 +48,8 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> -void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> - struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> +void __xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
>
> int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
> int xfs_inode_ag_iterator(struct xfs_mount *mp,
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-07 17:25 ` Felix Blyakher
0 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-07 17:25 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs, linux-fsdevel
On Aug 4, 2009, at 9:15 AM, Christoph Hellwig wrote:
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
Some comments below as well as couple of questions based on the
difference with previous code, not necessarily pointing to a
problem. Just trying to figure it out for myself.
>
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.775080254
> +0200
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-01 23:20:54.807080483 +0200
> @@ -133,80 +133,90 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * This inode is being torn down, pause and try again.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & XFS_IRECLAIM) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> + /*
> + * If we are racing with another cache hit that is currently
> recycling
> + * this inode out of the XFS_IRECLAIMABLE state, wait for the
> + * initialisation to complete before continuing.
> + */
> + if (ip->i_flags & XFS_INEW) {
Another case when we find XFS_INEW set is the race with the
cache miss, which just set up a new inode. Would the proposed
code be still sensible in that case? If yes, at least comments
should be updated.
>
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
>
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + XFS_STATS_INC(xs_ig_frecycle);
> + wait_on_inode(inode);
It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
Then the wait_on_inode() would return quickly even before the
linux inode is reinitialized. Though, that was the case with
the old code as well.
>
> + return EAGAIN;
This return seems inconsistent with usual goto end of function
convention. I understand that in this case goto out_error would
be incorrect. Should we create new label out_error_unlocked?
>
> + }
>
> + /*
> + * If lookup is racing with unlink, then we should return an
> + * error immediately so we don't remove it from the reclaim
> + * list and potentially leak the inode.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
Previously the conclusion of the race with unlink was based on
XFS_IRECLAIMABLE i_flag set in addition to the test above.
Is is no longer a case, or not really necessary?
>
> + error = ENOENT;
> + goto out_error;
> + }
> +
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (!inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + __xfs_inode_clear_reclaim_tag(pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(!inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -216,6 +226,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01
> 23:20:31.441330970 +0200
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-01
> 23:20:54.807080483 +0200
> @@ -708,6 +708,17 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> + ip->i_flags |= XFS_IRECLAIMABLE;
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,9 +733,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> - __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> xfs_put_perag(mp, pag);
> @@ -732,27 +741,13 @@ xfs_inode_set_reclaim_tag(
>
> void
> __xfs_inode_clear_reclaim_tag(
> - xfs_mount_t *mp,
> - xfs_perag_t *pag,
> - xfs_inode_t *ip)
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> {
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> radix_tree_tag_clear(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> -}
> -
> -void
> -xfs_inode_clear_reclaim_tag(
> - xfs_inode_t *ip)
> -{
> - xfs_mount_t *mp = ip->i_mount;
> - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
> -
> - read_lock(&pag->pag_ici_lock);
> - spin_lock(&ip->i_flags_lock);
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - spin_unlock(&ip->i_flags_lock);
> - read_unlock(&pag->pag_ici_lock);
> - xfs_put_perag(mp, pag);
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> }
>
> STATIC int
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01
> 23:20:31.449329683 +0200
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-01
> 23:20:54.808079772 +0200
> @@ -48,9 +48,8 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> -void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> - struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> +void __xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
>
> int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
> int xfs_inode_ag_iterator(struct xfs_mount *mp,
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-07 17:25 ` Felix Blyakher
@ 2009-08-10 17:09 ` Christoph Hellwig
-1 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-10 17:09 UTC (permalink / raw)
To: Felix Blyakher; +Cc: Christoph Hellwig, linux-fsdevel, xfs
On Fri, Aug 07, 2009 at 12:25:47PM -0500, Felix Blyakher wrote:
>> + if (ip->i_flags & XFS_INEW) {
>
> Another case when we find XFS_INEW set is the race with the
> cache miss, which just set up a new inode. Would the proposed
> code be still sensible in that case? If yes, at least comments
> should be updated.
>> + wait_on_inode(inode);
>
> It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
> Then the wait_on_inode() would return quickly even before the
> linux inode is reinitialized. Though, that was the case with
> the old code as well.
The wait_on_inode is only sensible for the non-recycle case. But it's
not actually very useful with our flags scheme, so for now I've reverted
to do the old-style polling and just added an XXX comment that we'll
eventually look into a better scheme.
>> + /*
>> + * If lookup is racing with unlink, then we should return an
>> + * error immediately so we don't remove it from the reclaim
>> + * list and potentially leak the inode.
>> + */
>> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
>
> Previously the conclusion of the race with unlink was based on
> XFS_IRECLAIMABLE i_flag set in addition to the test above.
> Is is no longer a case, or not really necessary?
We actually had the test two times in the old code, once for the
reclaim case, and once after the igrab succeeded. I just moved it into
a single command place. I've updated the comment to match that.
New version below:
--
Subject: xfs: fix locking in xfs_iget_cache_hit
From: Christoph Hellwig <hch@lst.de>
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
(this is oss.sgi.com BZ #819)
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933 -0300
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
@@ -191,80 +191,83 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
+
+ if (unlikely(inode_init_always(mp->m_super, inode))) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+
error = ENOMEM;
goto out_error;
}
-
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -274,6 +277,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:19.146974522 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:59.958993938 -0300
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:19.153974227 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:59.962994168 -0300
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-10 17:09 ` Christoph Hellwig
0 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-10 17:09 UTC (permalink / raw)
To: Felix Blyakher; +Cc: Christoph Hellwig, linux-fsdevel, xfs
On Fri, Aug 07, 2009 at 12:25:47PM -0500, Felix Blyakher wrote:
>> + if (ip->i_flags & XFS_INEW) {
>
> Another case when we find XFS_INEW set is the race with the
> cache miss, which just set up a new inode. Would the proposed
> code be still sensible in that case? If yes, at least comments
> should be updated.
>> + wait_on_inode(inode);
>
> It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
> Then the wait_on_inode() would return quickly even before the
> linux inode is reinitialized. Though, that was the case with
> the old code as well.
The wait_on_inode is only sensible for the non-recycle case. But it's
not actually very useful with our flags scheme, so for now I've reverted
to do the old-style polling and just added an XXX comment that we'll
eventually look into a better scheme.
>> + /*
>> + * If lookup is racing with unlink, then we should return an
>> + * error immediately so we don't remove it from the reclaim
>> + * list and potentially leak the inode.
>> + */
>> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
>
> Previously the conclusion of the race with unlink was based on
> XFS_IRECLAIMABLE i_flag set in addition to the test above.
> Is is no longer a case, or not really necessary?
We actually had the test two times in the old code, once for the
reclaim case, and once after the igrab succeeded. I just moved it into
a single command place. I've updated the comment to match that.
New version below:
--
Subject: xfs: fix locking in xfs_iget_cache_hit
From: Christoph Hellwig <hch@lst.de>
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
(this is oss.sgi.com BZ #819)
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933 -0300
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
@@ -191,80 +191,83 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
+
+ if (unlikely(inode_init_always(mp->m_super, inode))) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+
error = ENOMEM;
goto out_error;
}
-
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -274,6 +277,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:19.146974522 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:59.958993938 -0300
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:19.153974227 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:59.962994168 -0300
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-10 17:09 ` Christoph Hellwig
@ 2009-08-16 21:01 ` Eric Sandeen
-1 siblings, 0 replies; 38+ messages in thread
From: Eric Sandeen @ 2009-08-16 21:01 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
Christoph Hellwig wrote:
> New version below:
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the function.
> The new version acquires i_flags_lock early and only drops it once we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
This seems ok to me but I have to be honest, I'm having a hard time
getting my head around back into the inode lifecycle.
one comment, I wonder if it's worth capturing the actual error from
inode_init_always() vs. turning every error into ENOMEM? True, today
it's the only error we can get but why re-set it?
-Eric
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933 -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
> @@ -191,80 +191,83 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +277,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:19.146974522 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:59.958993938 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:19.153974227 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:59.962994168 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
> struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-16 21:01 ` Eric Sandeen
0 siblings, 0 replies; 38+ messages in thread
From: Eric Sandeen @ 2009-08-16 21:01 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Felix Blyakher, linux-fsdevel, xfs
Christoph Hellwig wrote:
> New version below:
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the function.
> The new version acquires i_flags_lock early and only drops it once we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
This seems ok to me but I have to be honest, I'm having a hard time
getting my head around back into the inode lifecycle.
one comment, I wonder if it's worth capturing the actual error from
inode_init_always() vs. turning every error into ENOMEM? True, today
it's the only error we can get but why re-set it?
-Eric
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933 -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
> @@ -191,80 +191,83 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +277,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:19.146974522 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:59.958993938 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:19.153974227 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:59.962994168 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
> struct xfs_inode *ip);
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-10 17:09 ` Christoph Hellwig
@ 2009-08-16 22:54 ` Felix Blyakher
-1 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-16 22:54 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
On Aug 10, 2009, at 12:09 PM, Christoph Hellwig wrote:
> On Fri, Aug 07, 2009 at 12:25:47PM -0500, Felix Blyakher wrote:
>>> + if (ip->i_flags & XFS_INEW) {
>>
>> Another case when we find XFS_INEW set is the race with the
>> cache miss, which just set up a new inode. Would the proposed
>> code be still sensible in that case? If yes, at least comments
>> should be updated.
>
>>> + wait_on_inode(inode);
>>
>> It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
>> Then the wait_on_inode() would return quickly even before the
>> linux inode is reinitialized. Though, that was the case with
>> the old code as well.
>
> The wait_on_inode is only sensible for the non-recycle case.
The case, I was referring to, was indeed the reclaimable one when
the first thread is going through
xfs_iget
xfs_iget_cache_hit
if (ip->i_flags & XFS_IRECLAIMABLE) {
ip->i_flags |= XFS_INEW;
-->
xfs_setup_inode
inode->i_state = I_NEW|I_LOCK;
while another therad run through the following sequence right where the
arrow shows above:
xfs_iget_cache_hit
if (ip->i_flags & XFS_INEW) {
wait_on_inode
There is nothing to wait on here yet, as I_LOCK is not set yet.
> But it's
> not actually very useful with our flags scheme, so for now I've
> reverted
> to do the old-style polling and just added an XXX comment that we'll
> eventually look into a better scheme.
>
>>> + /*
>>> + * If lookup is racing with unlink, then we should return an
>>> + * error immediately so we don't remove it from the reclaim
>>> + * list and potentially leak the inode.
>>> + */
>>> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
>>
>> Previously the conclusion of the race with unlink was based on
>> XFS_IRECLAIMABLE i_flag set in addition to the test above.
>> Is is no longer a case, or not really necessary?
>
> We actually had the test two times in the old code, once for the
> reclaim case, and once after the igrab succeeded. I just moved it into
> a single command place.
which doesn't test for XFS_IRECLAIMABLE.
I just wanted to make sure that was the intention.
> I've updated the comment to match that.
Thanks, that will make the intention clear.
>
>
>
> New version below:
>
> --
>
> Subject: xfs: fix locking in xfs_iget_cache_hit
> From: Christoph Hellwig <hch@lst.de>
>
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933
> -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
> @@ -191,80 +191,83 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
^ l
>
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
Seems redundant, as that will be set one step later in
xfs_setup_inode().
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +277,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10
> 13:10:19.146974522 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10
> 13:10:59.958993938 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10
> 13:10:19.153974227 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10
> 13:10:59.962994168 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-16 22:54 ` Felix Blyakher
0 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-16 22:54 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
On Aug 10, 2009, at 12:09 PM, Christoph Hellwig wrote:
> On Fri, Aug 07, 2009 at 12:25:47PM -0500, Felix Blyakher wrote:
>>> + if (ip->i_flags & XFS_INEW) {
>>
>> Another case when we find XFS_INEW set is the race with the
>> cache miss, which just set up a new inode. Would the proposed
>> code be still sensible in that case? If yes, at least comments
>> should be updated.
>
>>> + wait_on_inode(inode);
>>
>> It's possible to have XFS_INEW set, but no I_LOCK|I_NEW yet.
>> Then the wait_on_inode() would return quickly even before the
>> linux inode is reinitialized. Though, that was the case with
>> the old code as well.
>
> The wait_on_inode is only sensible for the non-recycle case.
The case, I was referring to, was indeed the reclaimable one when
the first thread is going through
xfs_iget
xfs_iget_cache_hit
if (ip->i_flags & XFS_IRECLAIMABLE) {
ip->i_flags |= XFS_INEW;
-->
xfs_setup_inode
inode->i_state = I_NEW|I_LOCK;
while another therad run through the following sequence right where the
arrow shows above:
xfs_iget_cache_hit
if (ip->i_flags & XFS_INEW) {
wait_on_inode
There is nothing to wait on here yet, as I_LOCK is not set yet.
> But it's
> not actually very useful with our flags scheme, so for now I've
> reverted
> to do the old-style polling and just added an XXX comment that we'll
> eventually look into a better scheme.
>
>>> + /*
>>> + * If lookup is racing with unlink, then we should return an
>>> + * error immediately so we don't remove it from the reclaim
>>> + * list and potentially leak the inode.
>>> + */
>>> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
>>
>> Previously the conclusion of the race with unlink was based on
>> XFS_IRECLAIMABLE i_flag set in addition to the test above.
>> Is is no longer a case, or not really necessary?
>
> We actually had the test two times in the old code, once for the
> reclaim case, and once after the igrab succeeded. I just moved it into
> a single command place.
which doesn't test for XFS_IRECLAIMABLE.
I just wanted to make sure that was the intention.
> I've updated the comment to match that.
Thanks, that will make the intention clear.
>
>
>
> New version below:
>
> --
>
> Subject: xfs: fix locking in xfs_iget_cache_hit
> From: Christoph Hellwig <hch@lst.de>
>
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933
> -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300
> @@ -191,80 +191,83 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
^ l
>
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> +
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> +
> + if (unlikely(inode_init_always(mp->m_super, inode))) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> +
> error = ENOMEM;
> goto out_error;
> }
> -
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> -
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + inode->i_state = I_LOCK|I_NEW;
Seems redundant, as that will be set one step later in
xfs_setup_inode().
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +277,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10
> 13:10:19.146974522 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10
> 13:10:59.958993938 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10
> 13:10:19.153974227 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10
> 13:10:59.962994168 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> struct xfs_inode *ip);
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-16 22:54 ` Felix Blyakher
@ 2009-08-17 0:36 ` Christoph Hellwig
-1 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-17 0:36 UTC (permalink / raw)
To: Felix Blyakher; +Cc: Christoph Hellwig, linux-fsdevel, xfs
On Sun, Aug 16, 2009 at 05:54:35PM -0500, Felix Blyakher wrote:
>> The wait_on_inode is only sensible for the non-recycle case.
>
> The case, I was referring to, was indeed the reclaimable one when
> the first thread is going through
>
> xfs_iget
> xfs_iget_cache_hit
> if (ip->i_flags & XFS_IRECLAIMABLE) {
> ip->i_flags |= XFS_INEW;
> -->
> xfs_setup_inode
> inode->i_state = I_NEW|I_LOCK;
>
>
> while another therad run through the following sequence right where the
> arrow shows above:
>
> xfs_iget_cache_hit
> if (ip->i_flags & XFS_INEW) {
> wait_on_inode
>
> There is nothing to wait on here yet, as I_LOCK is not set yet.
Yeah. The new version should fix it.
Here's a version with the small update that Eric suggested, any chance
we could get this into 2.6.31 still?
--
Subject: xfs: fix locking in xfs_iget_cache_hit
From: Christoph Hellwig <hch@lst.de>
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
(this is oss.sgi.com BZ #819)
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-16 20:10:25.200960533 -0300
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-16 20:11:30.580432781 -0300
@@ -191,80 +191,82 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
- error = ENOMEM;
- goto out_error;
- }
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ goto out_error;
+ }
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -274,6 +276,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16 20:01:14.632430664 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16 20:10:25.740968342 -0300
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16 20:01:14.640431122 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16 20:10:25.744967593 -0300
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-17 0:36 ` Christoph Hellwig
0 siblings, 0 replies; 38+ messages in thread
From: Christoph Hellwig @ 2009-08-17 0:36 UTC (permalink / raw)
To: Felix Blyakher; +Cc: Christoph Hellwig, linux-fsdevel, xfs
On Sun, Aug 16, 2009 at 05:54:35PM -0500, Felix Blyakher wrote:
>> The wait_on_inode is only sensible for the non-recycle case.
>
> The case, I was referring to, was indeed the reclaimable one when
> the first thread is going through
>
> xfs_iget
> xfs_iget_cache_hit
> if (ip->i_flags & XFS_IRECLAIMABLE) {
> ip->i_flags |= XFS_INEW;
> -->
> xfs_setup_inode
> inode->i_state = I_NEW|I_LOCK;
>
>
> while another therad run through the following sequence right where the
> arrow shows above:
>
> xfs_iget_cache_hit
> if (ip->i_flags & XFS_INEW) {
> wait_on_inode
>
> There is nothing to wait on here yet, as I_LOCK is not set yet.
Yeah. The new version should fix it.
Here's a version with the small update that Eric suggested, any chance
we could get this into 2.6.31 still?
--
Subject: xfs: fix locking in xfs_iget_cache_hit
From: Christoph Hellwig <hch@lst.de>
The locking in xfs_iget_cache_hit currently has numerous problems:
- we clear the reclaim tag without i_flags_lock which protects modifications
to it
- we call inode_init_always which can sleep with pag_ici_lock held
(this is oss.sgi.com BZ #819)
- we acquire and drop i_flags_lock a lot and thus provide no consistency
between the various flags we set/clear under it
This patch fixes all that with a major revamp of the locking in the function.
The new version acquires i_flags_lock early and only drops it once we need to
call into inode_init_always or before calling xfs_ilock.
This patch fixes a bug seen in the wild where we race modifying the reclaim tag.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Index: linux-2.6/fs/xfs/xfs_iget.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-16 20:10:25.200960533 -0300
+++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-16 20:11:30.580432781 -0300
@@ -191,80 +191,82 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
- error = ENOMEM;
- goto out_error;
- }
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ goto out_error;
+ }
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -274,6 +276,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16 20:01:14.632430664 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16 20:10:25.740968342 -0300
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16 20:01:14.640431122 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16 20:10:25.744967593 -0300
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
2009-08-17 0:36 ` Christoph Hellwig
@ 2009-08-17 3:05 ` Felix Blyakher
-1 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-17 3:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
On Aug 16, 2009, at 7:36 PM, Christoph Hellwig wrote:
> On Sun, Aug 16, 2009 at 05:54:35PM -0500, Felix Blyakher wrote:
>>> The wait_on_inode is only sensible for the non-recycle case.
>>
>> The case, I was referring to, was indeed the reclaimable one when
>> the first thread is going through
>>
>> xfs_iget
>> xfs_iget_cache_hit
>> if (ip->i_flags & XFS_IRECLAIMABLE) {
>> ip->i_flags |= XFS_INEW;
>> -->
>> xfs_setup_inode
>> inode->i_state = I_NEW|I_LOCK;
>>
>>
>> while another therad run through the following sequence right where
>> the
>> arrow shows above:
>>
>> xfs_iget_cache_hit
>> if (ip->i_flags & XFS_INEW) {
>> wait_on_inode
>>
>> There is nothing to wait on here yet, as I_LOCK is not set yet.
>
> Yeah. The new version should fix it.
Agree.
> Here's a version with the small update that Eric suggested, any chance
> we could get this into 2.6.31 still?
Will send the pull request tonight.
Felix
>
>
> --
>
> Subject: xfs: fix locking in xfs_iget_cache_hit
> From: Christoph Hellwig <hch@lst.de>
>
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Felix Blyakher <felixb@sgi.com>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-16 20:10:25.200960533
> -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-16 20:11:30.580432781 -0300
> @@ -191,80 +191,82 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> - error = ENOMEM;
> - goto out_error;
> - }
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
>
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
>
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + error = -inode_init_always(mp->m_super, inode);
> + if (error) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> + goto out_error;
> + }
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +276,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16
> 20:01:14.632430664 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16
> 20:10:25.740968342 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16
> 20:01:14.640431122 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16
> 20:10:25.744967593 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> struct xfs_inode *ip);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH 1/4] xfs: fix locking in xfs_iget_cache_hit
@ 2009-08-17 3:05 ` Felix Blyakher
0 siblings, 0 replies; 38+ messages in thread
From: Felix Blyakher @ 2009-08-17 3:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, xfs
On Aug 16, 2009, at 7:36 PM, Christoph Hellwig wrote:
> On Sun, Aug 16, 2009 at 05:54:35PM -0500, Felix Blyakher wrote:
>>> The wait_on_inode is only sensible for the non-recycle case.
>>
>> The case, I was referring to, was indeed the reclaimable one when
>> the first thread is going through
>>
>> xfs_iget
>> xfs_iget_cache_hit
>> if (ip->i_flags & XFS_IRECLAIMABLE) {
>> ip->i_flags |= XFS_INEW;
>> -->
>> xfs_setup_inode
>> inode->i_state = I_NEW|I_LOCK;
>>
>>
>> while another therad run through the following sequence right where
>> the
>> arrow shows above:
>>
>> xfs_iget_cache_hit
>> if (ip->i_flags & XFS_INEW) {
>> wait_on_inode
>>
>> There is nothing to wait on here yet, as I_LOCK is not set yet.
>
> Yeah. The new version should fix it.
Agree.
> Here's a version with the small update that Eric suggested, any chance
> we could get this into 2.6.31 still?
Will send the pull request tonight.
Felix
>
>
> --
>
> Subject: xfs: fix locking in xfs_iget_cache_hit
> From: Christoph Hellwig <hch@lst.de>
>
> The locking in xfs_iget_cache_hit currently has numerous problems:
>
> - we clear the reclaim tag without i_flags_lock which protects
> modifications
> to it
> - we call inode_init_always which can sleep with pag_ici_lock held
> (this is oss.sgi.com BZ #819)
> - we acquire and drop i_flags_lock a lot and thus provide no
> consistency
> between the various flags we set/clear under it
>
> This patch fixes all that with a major revamp of the locking in the
> function.
> The new version acquires i_flags_lock early and only drops it once
> we need to
> call into inode_init_always or before calling xfs_ilock.
>
> This patch fixes a bug seen in the wild where we race modifying the
> reclaim tag.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Felix Blyakher <felixb@sgi.com>
>
> Index: linux-2.6/fs/xfs/xfs_iget.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-16 20:10:25.200960533
> -0300
> +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-16 20:11:30.580432781 -0300
> @@ -191,80 +191,82 @@ xfs_iget_cache_hit(
> int flags,
> int lock_flags) __releases(pag->pag_ici_lock)
> {
> + struct inode *inode = VFS_I(ip);
> struct xfs_mount *mp = ip->i_mount;
> - int error = EAGAIN;
> + int error;
> +
> + spin_lock(&ip->i_flags_lock);
>
> /*
> - * If INEW is set this inode is being set up
> - * If IRECLAIM is set this inode is being torn down
> - * Pause and try again.
> + * If we are racing with another cache hit that is currently
> + * instantiating this inode or currently recycling it out of
> + * reclaimabe state, wait for the initialisation to complete
> + * before continuing.
> + *
> + * XXX(hch): eventually we should do something equivalent to
> + * wait_on_inode to wait for these flags to be cleared
> + * instead of polling for it.
> */
> - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
> + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
> XFS_STATS_INC(xs_ig_frecycle);
> + error = EAGAIN;
> goto out_error;
> }
>
> - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
> - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
> -
> - /*
> - * If lookup is racing with unlink, then we should return an
> - * error immediately so we don't remove it from the reclaim
> - * list and potentially leak the inode.
> - */
> - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - goto out_error;
> - }
> + /*
> + * If lookup is racing with unlink return an error immediately.
> + */
> + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> + error = ENOENT;
> + goto out_error;
> + }
>
> + /*
> + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
> + * Need to carefully get it back into useable state.
> + */
> + if (ip->i_flags & XFS_IRECLAIMABLE) {
> xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
>
> /*
> - * We need to re-initialise the VFS inode as it has been
> - * 'freed' by the VFS. Do this here so we can deal with
> - * errors cleanly, then tag it so it can be set up correctly
> - * later.
> + * We need to set XFS_INEW atomically with clearing the
> + * reclaimable tag so that we do have an indicator of the
> + * inode still being initialized.
> */
> - if (inode_init_always(mp->m_super, VFS_I(ip))) {
> - error = ENOMEM;
> - goto out_error;
> - }
> + ip->i_flags |= XFS_INEW;
> + ip->i_flags &= ~XFS_IRECLAIMABLE;
> + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
>
> - /*
> - * We must set the XFS_INEW flag before clearing the
> - * XFS_IRECLAIMABLE flag so that if a racing lookup does
> - * not find the XFS_IRECLAIMABLE above but has the igrab()
> - * below succeed we can safely check XFS_INEW to detect
> - * that this inode is still being initialised.
> - */
> - xfs_iflags_set(ip, XFS_INEW);
> - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
>
> - /* clear the radix tree reclaim flag as well. */
> - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
> - } else if (!igrab(VFS_I(ip))) {
> + error = -inode_init_always(mp->m_super, inode);
> + if (error) {
> + /*
> + * Re-initializing the inode failed, and we are in deep
> + * trouble. Try to re-add it to the reclaim list.
> + */
> + read_lock(&pag->pag_ici_lock);
> + spin_lock(&ip->i_flags_lock);
> +
> + ip->i_flags &= ~XFS_INEW;
> + ip->i_flags |= XFS_IRECLAIMABLE;
> + __xfs_inode_set_reclaim_tag(pag, ip);
> + goto out_error;
> + }
> + inode->i_state = I_LOCK|I_NEW;
> + } else {
> /* If the VFS inode is being torn down, pause and try again. */
> - XFS_STATS_INC(xs_ig_frecycle);
> - goto out_error;
> - } else if (xfs_iflags_test(ip, XFS_INEW)) {
> - /*
> - * We are racing with another cache hit that is
> - * currently recycling this inode out of the XFS_IRECLAIMABLE
> - * state. Wait for the initialisation to complete before
> - * continuing.
> - */
> - wait_on_inode(VFS_I(ip));
> - }
> + if (!igrab(inode)) {
> + error = EAGAIN;
> + goto out_error;
> + }
>
> - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
> - error = ENOENT;
> - iput(VFS_I(ip));
> - goto out_error;
> + /* We've got a live one. */
> + spin_unlock(&ip->i_flags_lock);
> + read_unlock(&pag->pag_ici_lock);
> }
>
> - /* We've got a live one. */
> - read_unlock(&pag->pag_ici_lock);
> -
> if (lock_flags != 0)
> xfs_ilock(ip, lock_flags);
>
> @@ -274,6 +276,7 @@ xfs_iget_cache_hit(
> return 0;
>
> out_error:
> + spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> return error;
> }
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16
> 20:01:14.632430664 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-16
> 20:10:25.740968342 -0300
> @@ -708,6 +708,16 @@ xfs_reclaim_inode(
> return 0;
> }
>
> +void
> +__xfs_inode_set_reclaim_tag(
> + struct xfs_perag *pag,
> + struct xfs_inode *ip)
> +{
> + radix_tree_tag_set(&pag->pag_ici_root,
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + XFS_ICI_RECLAIM_TAG);
> +}
> +
> /*
> * We set the inode flag atomically with the radix tree tag.
> * Once we get tag lookups on the radix tree, this inode flag
> @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
>
> read_lock(&pag->pag_ici_lock);
> spin_lock(&ip->i_flags_lock);
> - radix_tree_tag_set(&pag->pag_ici_root,
> - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> + __xfs_inode_set_reclaim_tag(pag, ip);
> __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
> spin_unlock(&ip->i_flags_lock);
> read_unlock(&pag->pag_ici_lock);
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16
> 20:01:14.640431122 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-16
> 20:10:25.744967593 -0300
> @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *
> int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>
> void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
> +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct
> xfs_inode *ip);
> void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
> void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct
> xfs_perag *pag,
> struct xfs_inode *ip);
^ permalink raw reply [flat|nested] 38+ messages in thread