From: Patrick Schreurs <patrick@news-service.com>
To: Christoph Hellwig <hch@infradead.org>
Cc: Tommy van Leeuwen <tommy@news-service.com>, xfs@oss.sgi.com
Subject: Re: [PATCH] Inode reclaim fixes (was Re: 2.6.31 xfs_fs_destroy_inode: cannot reclaim)
Date: Tue, 09 Feb 2010 09:48:38 +0100 [thread overview]
Message-ID: <4B712166.9010701@news-service.com> (raw)
In-Reply-To: <20100208194226.GD9527@infradead.org>
[-- Attachment #1: Type: text/plain, Size: 551 bytes --]
On 8-2-2010 20:42, Christoph Hellwig wrote:
> On Mon, Feb 01, 2010 at 05:52:30PM +0100, Patrick Schreurs wrote:
>> Hello Dave,
>>
>> I'm afraid we had another crash a few days ago. Have a look at the
>> screenshot. Can you make anything out of it? This server is running
>> 2.6.32.3 with your inode-reclaim patch applied and XFS_DEBUG still
>> enabled.
>
> Just wondering, which set of patches is this exactly?
This is a clean 2.6.32.3 with the xfs-inode-reclaim-2.6.32 patch i
received from Dave on January 8th (see attachment).
Thanks,
-Patrick
[-- Attachment #2: xfs-inode-reclaim-2.6.32 --]
[-- Type: text/plain, Size: 12580 bytes --]
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 18a4b8e..f3c622a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -930,13 +930,37 @@ xfs_fs_alloc_inode(
*/
STATIC void
xfs_fs_destroy_inode(
- struct inode *inode)
+ struct inode *inode)
{
- xfs_inode_t *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_itrace_entry(ip);
XFS_STATS_INC(vn_reclaim);
- if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
+
+ /* bad inode, get out here ASAP */
+ if (is_bad_inode(inode))
+ goto out_reclaim;
+
+ xfs_ioend_wait(ip);
+
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+ /*
+ * We should never get here with one of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+ * we always use background reclaim here because even if the
+ * inode is clean, it still may be under IO and hence we have
+ * to take the flush lock. The background reclaim path handles
+ * this more efficiently than we can here, so simply let background
+ * reclaim tear down all inodes.
+ */
+out_reclaim:
+ xfs_inode_set_reclaim_tag(ip);
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a..897fcb5 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -54,7 +54,8 @@ xfs_inode_ag_lookup(
struct xfs_mount *mp,
struct xfs_perag *pag,
uint32_t *first_index,
- int tag)
+ int tag,
+ int write_lock)
{
int nr_found;
struct xfs_inode *ip;
@@ -64,7 +65,10 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
- read_lock(&pag->pag_ici_lock);
+ if (write_lock)
+ write_lock(&pag->pag_ici_lock);
+ else
+ read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
@@ -88,7 +92,10 @@ xfs_inode_ag_lookup(
return ip;
unlock:
- read_unlock(&pag->pag_ici_lock);
+ if (write_lock)
+ write_unlock(&pag->pag_ici_lock);
+ else
+ read_unlock(&pag->pag_ici_lock);
return NULL;
}
@@ -99,7 +106,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int write_lock)
{
struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index;
@@ -113,7 +121,8 @@ restart:
int error = 0;
xfs_inode_t *ip;
- ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+ ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag,
+ write_lock);
if (!ip)
break;
@@ -147,7 +156,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int write_lock)
{
int error = 0;
int last_error = 0;
@@ -156,7 +166,8 @@ xfs_inode_ag_iterator(
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init)
continue;
- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+ write_lock);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
@@ -180,18 +191,20 @@ xfs_sync_inode_valid(
return EFSCORRUPTED;
}
- /*
- * If we can't get a reference on the inode, it must be in reclaim.
- * Leave it for the reclaim code to flush. Also avoid inodes that
- * haven't been fully initialised.
- */
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) {
+ read_unlock(&pag->pag_ici_lock);
+ return ENOENT;
+ }
+
+ /* If we can't get a reference on the inode, it must be in reclaim. */
if (!igrab(inode)) {
read_unlock(&pag->pag_ici_lock);
return ENOENT;
}
read_unlock(&pag->pag_ici_lock);
- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+ if (is_bad_inode(inode)) {
IRELE(ip);
return ENOENT;
}
@@ -281,7 +294,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
if (error)
return XFS_ERROR(error);
@@ -303,7 +316,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
}
STATIC int
@@ -663,36 +676,11 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-int
+STATIC int
xfs_reclaim_inode(
xfs_inode_t *ip,
- int locked,
int sync_mode)
{
- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
- /* The hash lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
- write_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- if (locked) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- return -EAGAIN;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_put_perag(ip->i_mount, pag);
-
/*
* If the inode is still dirty, then flush it out. If the inode
* is not in the AIL, then it will be OK to flush it delwri as
@@ -704,14 +692,14 @@ xfs_reclaim_inode(
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
* wait for the inode to be unpinned before returning an error.
+ * Because we hold the flush lock, we know that the inode cannot
+ * be under IO, so if it reports clean it can be reclaimed.
*/
if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
/* synchronize with xfs_iflush_done */
@@ -771,14 +759,24 @@ xfs_reclaim_inode_now(
struct xfs_perag *pag,
int flags)
{
- /* ignore if already under reclaim */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
return 0;
}
- read_unlock(&pag->pag_ici_lock);
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
- return xfs_reclaim_inode(ip, 0, flags);
+ return xfs_reclaim_inode(ip, flags);
}
int
@@ -787,5 +785,5 @@ xfs_reclaim_inodes(
int mode)
{
return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
- XFS_ICI_RECLAIM_TAG);
+ XFS_ICI_RECLAIM_TAG, 1);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 27920eb..ea932b4 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
@@ -55,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag);
+ int flags, int tag, int write_lock);
#endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d1a3b9..f99cfa4 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}
/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 80e5264..40e8775 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -511,17 +511,21 @@ xfs_ireclaim(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
XFS_STATS_INC(xs_ig_reclaims);
/*
- * Remove the inode from the per-AG radix tree. It doesn't matter
- * if it was never added to it because radix_tree_delete can deal
- * with that case just fine.
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
*/
pag = xfs_get_perag(mp, ip->i_ino);
write_lock(&pag->pag_ici_lock);
- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
+ if (!radix_tree_delete(&pag->pag_ici_root, agino))
+ ASSERT(0);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b92a4fa..13d7d21 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2877,10 +2877,14 @@ xfs_iflush(
mp = ip->i_mount;
/*
- * If the inode isn't dirty, then just release the inode
- * flush lock and do nothing.
+ * If the inode isn't dirty, then just release the inode flush lock and
+ * do nothing. Treat stale inodes the same; we cannot rely on the
+ * backing buffer remaining stale in cache for the remaining life of
+ * the stale inode and so xfs_itobp() below may give us a buffer that
+ * no longer contains inodes below. Doing this stale check here also
+ * avoids forcing the log on pinned, stale inodes.
*/
- if (xfs_inode_clean(ip)) {
+ if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
xfs_ifunlock(ip);
return 0;
}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b572f7e..3fac146 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2456,46 +2456,6 @@ xfs_set_dmattrs(
return error;
}
-int
-xfs_reclaim(
- xfs_inode_t *ip)
-{
-
- xfs_itrace_entry(ip);
-
- ASSERT(!VN_MAPPED(VFS_I(ip)));
-
- /* bad inode, get out here ASAP */
- if (is_bad_inode(VFS_I(ip))) {
- xfs_ireclaim(ip);
- return 0;
- }
-
- xfs_ioend_wait(ip);
-
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
- /*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise break the link between the xfs inode and the
- * linux inode and clean up the xfs inode later. This avoids flushing
- * the inode to disk during the delete operation itself.
- *
- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
- * first to ensure that xfs_iunpin() will never see an xfs inode
- * that has a linux inode being reclaimed. Synchronisation is provided
- * by the i_flags_lock.
- */
- if (!ip->i_update_core && (ip->i_itemp == NULL)) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
- }
- xfs_inode_set_reclaim_tag(ip);
- return 0;
-}
-
/*
* xfs_alloc_file_space()
* This routine allocates disk space for the given file.
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102d..167a467 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, mode_t mode, struct xfs_inode **ipp,
cred_t *credp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
[-- Attachment #3: Type: text/plain, Size: 121 bytes --]
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
next prev parent reply other threads:[~2010-02-09 8:47 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-16 10:27 2.6.31 xfs_fs_destroy_inode: cannot reclaim Tommy van Leeuwen
2009-09-17 18:59 ` Christoph Hellwig
2009-09-29 10:15 ` Patrick Schreurs
2009-09-29 12:57 ` Christoph Hellwig
2009-09-30 10:48 ` Patrick Schreurs
2009-09-30 12:41 ` Christoph Hellwig
2009-10-02 14:24 ` Bas Couwenberg
2009-10-05 21:43 ` Christoph Hellwig
2009-10-06 9:04 ` Patrick Schreurs
2009-10-07 1:19 ` Christoph Hellwig
2009-10-08 8:45 ` Patrick Schreurs
2009-10-11 7:43 ` Patrick Schreurs
2009-10-11 12:24 ` Christoph Hellwig
2009-10-12 23:38 ` Christoph Hellwig
2009-10-15 15:06 ` Tommy van Leeuwen
2009-10-18 23:59 ` Christoph Hellwig
2009-10-19 1:17 ` Dave Chinner
2009-10-19 3:53 ` Christoph Hellwig
2009-10-19 1:16 ` Dave Chinner
2009-10-19 3:54 ` Christoph Hellwig
2009-10-20 3:40 ` Dave Chinner
2009-10-21 9:45 ` Tommy van Leeuwen
2009-10-22 8:59 ` Christoph Hellwig
2009-10-27 10:41 ` Tommy van Leeuwen
[not found] ` <89c4f90c0910280519k759230c1r7b1586932ac792f7@mail.gmail.com>
2009-10-30 10:16 ` Christoph Hellwig
2009-11-03 14:46 ` Patrick Schreurs
2009-11-14 16:21 ` Christoph Hellwig
[not found] ` <4B0A8075.8080008@news-service.com>
[not found] ` <20091211115932.GA20632@infradead.org>
[not found] ` <4B3F9F88.9030307@news-service.com>
[not found] ` <20100107110446.GA13802@discord.disaster>
[not found] ` <4B45CFAC.4000607@news-service.com>
2010-01-08 11:31 ` [PATCH] Inode reclaim fixes (was Re: 2.6.31 xfs_fs_destroy_inode: cannot reclaim) Dave Chinner
2010-01-11 20:22 ` Patrick Schreurs
2010-01-15 11:01 ` Patrick Schreurs
2010-02-01 16:52 ` Patrick Schreurs
2010-02-08 10:16 ` Patrick Schreurs
2010-02-08 19:42 ` Christoph Hellwig
2010-02-09 8:48 ` Patrick Schreurs [this message]
2010-02-09 10:31 ` Christoph Hellwig
2010-02-10 12:42 ` Patrick Schreurs
2010-02-10 14:55 ` Christoph Hellwig
2010-02-10 15:42 ` Patrick Schreurs
2010-02-10 15:47 ` Christoph Hellwig
2010-02-24 18:30 ` Patrick Schreurs
2010-02-25 23:45 ` Dave Chinner
2010-03-01 9:51 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4B712166.9010701@news-service.com \
--to=patrick@news-service.com \
--cc=hch@infradead.org \
--cc=tommy@news-service.com \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox