* [009/197] xfs: simplify inode teardown
[not found] <20100422191857.GA13268@kroah.com>
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [010/197] xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks Greg KH
` (16 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit 848ce8f731aed0a2d4ab5884a4f6664af73d2dd0 upstream
Currently the reclaim code for the case where we don't reclaim the
final reclaim is overly complicated. We know that the inode is clean
but instead of just directly reclaiming the clean inode we go through
the whole process of marking the inode reclaimable just to directly
reclaim it from the calling context. Besides being overly complicated
this introduces a race where iget could recycle an inode between
marked reclaimable and actually being reclaimed leading to panics.
This patch gets rid of the existing reclaim path, and replaces it with
a simple call to xfs_ireclaim if the inode was clean. While we're at
it we also use the slightly more lax xfs_inode_clean check we'd use
later to determine if we need to flush the inode here.
Finally get rid of xfs_reclaim function and place the remaining small
bits of reclaim code directly into xfs_fs_destroy_inode.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Patrick Schreurs <patrick@news-service.com>
Reported-by: Tommy van Leeuwen <tommy@news-service.com>
Tested-by: Patrick Schreurs <patrick@news-service.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_super.c | 34 ++++++++++++++++++++++++++++++----
fs/xfs/linux-2.6/xfs_sync.c | 15 ++++-----------
fs/xfs/linux-2.6/xfs_sync.h | 1 -
fs/xfs/xfs_vnodeops.c | 40 ----------------------------------------
fs/xfs/xfs_vnodeops.h | 1 -
5 files changed, 34 insertions(+), 57 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -930,13 +930,39 @@ xfs_fs_alloc_inode(
*/
STATIC void
xfs_fs_destroy_inode(
- struct inode *inode)
+ struct inode *inode)
{
- xfs_inode_t *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_itrace_entry(ip);
XFS_STATS_INC(vn_reclaim);
- if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
+
+ /* bad inode, get out here ASAP */
+ if (is_bad_inode(inode))
+ goto out_reclaim;
+
+ xfs_ioend_wait(ip);
+
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+ /*
+ * We should never get here with one of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+ * If we have nothing to flush with this inode then complete the
+ * teardown now, otherwise delay the flush operation.
+ */
+ if (!xfs_inode_clean(ip)) {
+ xfs_inode_set_reclaim_tag(ip);
+ return;
+ }
+
+out_reclaim:
+ xfs_ireclaim(ip);
}
/*
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -663,10 +663,9 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-int
+STATIC int
xfs_reclaim_inode(
xfs_inode_t *ip,
- int locked,
int sync_mode)
{
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
@@ -682,10 +681,6 @@ xfs_reclaim_inode(
!__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
- if (locked) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
return -EAGAIN;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
@@ -704,10 +699,8 @@ xfs_reclaim_inode(
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
@@ -778,7 +771,7 @@ xfs_reclaim_inode_now(
}
read_unlock(&pag->pag_ici_lock);
- return xfs_reclaim_inode(ip, 0, flags);
+ return xfs_reclaim_inode(ip, flags);
}
int
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2456,46 +2456,6 @@ xfs_set_dmattrs(
return error;
}
-int
-xfs_reclaim(
- xfs_inode_t *ip)
-{
-
- xfs_itrace_entry(ip);
-
- ASSERT(!VN_MAPPED(VFS_I(ip)));
-
- /* bad inode, get out here ASAP */
- if (is_bad_inode(VFS_I(ip))) {
- xfs_ireclaim(ip);
- return 0;
- }
-
- xfs_ioend_wait(ip);
-
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
- /*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise break the link between the xfs inode and the
- * linux inode and clean up the xfs inode later. This avoids flushing
- * the inode to disk during the delete operation itself.
- *
- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
- * first to ensure that xfs_iunpin() will never see an xfs inode
- * that has a linux inode being reclaimed. Synchronisation is provided
- * by the i_flags_lock.
- */
- if (!ip->i_update_core && (ip->i_itemp == NULL)) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
- }
- xfs_inode_set_reclaim_tag(ip);
- return 0;
-}
-
/*
* xfs_alloc_file_space()
* This routine allocates disk space for the given file.
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, st
const char *target_path, mode_t mode, struct xfs_inode **ipp,
cred_t *credp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [010/197] xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks
[not found] <20100422191857.GA13268@kroah.com>
2010-04-22 19:07 ` [009/197] xfs: simplify inode teardown Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [011/197] xfs: I/O completion handlers must use NOFS allocations Greg KH
` (15 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit c56c9631cbe88f08854a56ff9776c1f310916830 upstream
When xfs_free_eofblocks is called from ->release the VM might already
hold the mmap_sem, but in the write path we take the iolock before
taking the mmap_sem in the generic write code.
Switch xfs_free_eofblocks to only trylock the iolock if called from
->release and skip trimming the prellocated blocks in that case.
We'll still free them later on the final iput.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_rw.h | 7 -------
fs/xfs/xfs_vnodeops.c | 34 ++++++++++++++++++++++++++--------
2 files changed, 26 insertions(+), 15 deletions(-)
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_
}
/*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK (1<<0)
-#define XFS_FREE_EOF_NOLOCK (1<<1)
-
-
-/*
* helper function to extract extent size hint from inode
*/
STATIC_INLINE xfs_extlen_t
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -709,6 +709,11 @@ xfs_fsync(
}
/*
+ * Flags for xfs_free_eofblocks
+ */
+#define XFS_FREE_EOF_TRYLOCK (1<<0)
+
+/*
* This is called by xfs_inactive to free any blocks beyond eof
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
@@ -726,7 +731,6 @@ xfs_free_eofblocks(
xfs_filblks_t map_len;
int nimaps;
xfs_bmbt_irec_t imap;
- int use_iolock = (flags & XFS_FREE_EOF_LOCK);
/*
* Figure out if there are any blocks beyond the end
@@ -768,14 +772,19 @@ xfs_free_eofblocks(
* cache and we can't
* do that within a transaction.
*/
- if (use_iolock)
+ if (flags & XFS_FREE_EOF_TRYLOCK) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+ } else {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ }
error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
ip->i_size);
if (error) {
xfs_trans_cancel(tp, 0);
- if (use_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
}
@@ -812,8 +821,7 @@ xfs_free_eofblocks(
error = xfs_trans_commit(tp,
XFS_TRANS_RELEASE_LOG_RES);
}
- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
- : XFS_ILOCK_EXCL));
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
}
return error;
}
@@ -1113,7 +1121,17 @@ xfs_release(
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+
+ /*
+ * If we can't get the iolock just skip truncating
+ * the blocks past EOF because we could deadlock
+ * with the mmap_sem otherwise. We'll get another
+ * chance to drop them once the last reference to
+ * the inode is dropped, so we'll never leak blocks
+ * permanently.
+ */
+ error = xfs_free_eofblocks(mp, ip,
+ XFS_FREE_EOF_TRYLOCK);
if (error)
return error;
}
@@ -1184,7 +1202,7 @@ xfs_inactive(
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
(ip->i_delayed_blks != 0)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+ error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [011/197] xfs: I/O completion handlers must use NOFS allocations
[not found] <20100422191857.GA13268@kroah.com>
2010-04-22 19:07 ` [009/197] xfs: simplify inode teardown Greg KH
2010-04-22 19:07 ` [010/197] xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [012/197] xfs: Wrapped journal record corruption on read at recovery Greg KH
` (14 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb upstream
When completing I/O requests we must not allow the memory allocator to
recurse into the filesystem, as we might deadlock on waiting for the
I/O completion otherwise. The only thing currently allocating normal
GFP_KERNEL memory is the allocation of the transaction structure for
the unwritten extent conversion. Add a memflags argument to
_xfs_trans_alloc to allow controlling the allocator behaviour.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Thomas Neumann <tneumann@users.sourceforge.net>
Tested-by: Thomas Neumann <tneumann@users.sourceforge.net>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_fsops.c | 2 +-
fs/xfs/xfs_iomap.c | 9 ++++++++-
fs/xfs/xfs_mount.c | 2 +-
fs/xfs/xfs_trans.c | 7 ++++---
fs/xfs/xfs_trans.h | 2 +-
5 files changed, 15 insertions(+), 7 deletions(-)
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
xfs_inode_t *ip;
int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
* set up a transaction to convert the range of extents
* from unwritten to real. Do allocations in a loop until
* we have covered the range passed in.
+ *
+ * Note that we open code the transaction allocation here
+ * to pass KM_NOFS--we can't risk to recursing back into
+ * the filesystem here as we might be asked to write out
+ * the same inode that we complete here and might deadlock
+ * on the iolock.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), 0,
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return 0;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
XFS_DEFAULT_LOG_COUNT);
if (error) {
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
uint type)
{
xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
- return _xfs_trans_alloc(mp, type);
+ return _xfs_trans_alloc(mp, type, KM_SLEEP);
}
xfs_trans_t *
_xfs_trans_alloc(
xfs_mount_t *mp,
- uint type)
+ uint type,
+ uint memflags)
{
xfs_trans_t *tp;
atomic_inc(&mp->m_active_trans);
- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
tp->t_magic = XFS_TRANS_MAGIC;
tp->t_type = type;
tp->t_mountp = mp;
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -924,7 +924,7 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces.
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
uint, uint);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [012/197] xfs: Wrapped journal record corruption on read at recovery
[not found] <20100422191857.GA13268@kroah.com>
` (2 preceding siblings ...)
2010-04-22 19:07 ` [011/197] xfs: I/O completion handlers must use NOFS allocations Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [013/197] xfs: Fix error return for fallocate() on XFS Greg KH
` (13 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: Andy Poling, xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Andy Poling <andy@realbig.com>
commit fc5bc4c85c45f0bf854404e5736aa8b65720a18d upstream
Summary of problem:
If a journal record wraps at the physical end of the journal, it has to be
read in two parts in xlog_do_recovery_pass(): a read at the physical end and a
read at the physical beginning. If xlog_bread() has to re-align the first
read, the second read request does not take that re-alignment into account.
If the first read was re-aligned, the second read over-writes the end of the
data from the first read, effectively corrupting it. This can happen either
when reading the record header or reading the record data.
The first sanity check in xlog_recover_process_data() is to check for a valid
clientid, so that is the error reported.
Summary of fix:
If there was a first read at the physical end, XFS_BUF_PTR() returns where the
data was requested to begin. Conversely, because it is the result of
xlog_align(), offset indicates where the requested data for the first read
actually begins - whether or not xlog_bread() has re-aligned it.
Using offset as the base for the calculation of where to place the second read
data ensures that it will be correctly placed immediately following the data
from the first read instead of sometimes over-writing the end of it.
The attached patch has resolved the reported problem of occasional inability
to recover the journal (reporting "bad clientid").
Signed-off-by: Andy Poling <andy@realbig.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_log_recover.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t bufaddr, offset;
+ xfs_caddr_t offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass(
/*
* Check for header wrapping around physical end-of-log
*/
- offset = NULL;
+ offset = XFS_BUF_PTR(hbp);
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- bufaddr = XFS_BUF_PTR(hbp);
error = XFS_BUF_SET_PTR(hbp,
- bufaddr + BBTOB(split_hblks),
+ offset + BBTOB(split_hblks),
BBTOB(hblks - split_hblks));
if (error)
goto bread_err2;
@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(hbp, bufaddr,
+ error = XFS_BUF_SET_PTR(hbp, offset,
BBTOB(hblks));
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, 0,
- wrapped_hblks, hbp);
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass(
} else {
/* This log record is split across the
* physical end of log */
- offset = NULL;
+ offset = XFS_BUF_PTR(dbp);
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- bufaddr = XFS_BUF_PTR(dbp);
error = XFS_BUF_SET_PTR(dbp,
- bufaddr + BBTOB(split_bblks),
+ offset + BBTOB(split_bblks),
BBTOB(bblks - split_bblks));
if (error)
goto bread_err2;
@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+ error = XFS_BUF_SET_PTR(dbp, offset, h_size);
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, wrapped_hblks,
- bblks - split_bblks, dbp);
}
xlog_unpack_data(rhead, offset, log);
if ((error = xlog_recover_process_data(log, rhash,
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [013/197] xfs: Fix error return for fallocate() on XFS
[not found] <20100422191857.GA13268@kroah.com>
` (3 preceding siblings ...)
2010-04-22 19:07 ` [012/197] xfs: Wrapped journal record corruption on read at recovery Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [014/197] xfs: check for not fully initialized inodes in xfs_ireclaim Greg KH
` (12 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: xfs, Jason Gunthorpe, Alex Elder, akpm, torvalds, stable-review,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
commit 44a743f68705c681439f264deb05f8f38e9048d3 upstream
Noticed that through glibc fallocate would return 28 rather than -1
and errno = 28 for ENOSPC. The xfs routines uses XFS_ERROR format
positive return error codes while the syscalls use negative return
codes. Fixup the two cases in xfs_vn_fallocate syscall to convert to
negative.
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_iops.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -573,8 +573,8 @@ xfs_vn_fallocate(
bf.l_len = len;
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, XFS_ATTR_NOLOCK);
+ error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
+ 0, XFS_ATTR_NOLOCK);
if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode))
new_size = offset + len;
@@ -585,7 +585,7 @@ xfs_vn_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [014/197] xfs: check for not fully initialized inodes in xfs_ireclaim
[not found] <20100422191857.GA13268@kroah.com>
` (4 preceding siblings ...)
2010-04-22 19:07 ` [013/197] xfs: Fix error return for fallocate() on XFS Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [015/197] xfs: fix timestamp handling in xfs_setattr Greg KH
` (11 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit b44b1126279b60597f96bbe77507b1650f88a969 upstream
Add an assert for inodes not added to the inode cache in xfs_ireclaim,
to make sure we're not going to introduce something like the
famous nfsd inode cache bug again.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_iget.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -511,17 +511,21 @@ xfs_ireclaim(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
XFS_STATS_INC(xs_ig_reclaims);
/*
- * Remove the inode from the per-AG radix tree. It doesn't matter
- * if it was never added to it because radix_tree_delete can deal
- * with that case just fine.
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
*/
pag = xfs_get_perag(mp, ip->i_ino);
write_lock(&pag->pag_ici_lock);
- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
+ if (!radix_tree_delete(&pag->pag_ici_root, agino))
+ ASSERT(0);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [015/197] xfs: fix timestamp handling in xfs_setattr
[not found] <20100422191857.GA13268@kroah.com>
` (5 preceding siblings ...)
2010-04-22 19:07 ` [014/197] xfs: check for not fully initialized inodes in xfs_ireclaim Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [016/197] xfs: Dont flush stale inodes Greg KH
` (10 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b upstream
We currently have some rather odd code in xfs_setattr for
updating the a/c/mtime timestamps:
- first we do a non-transaction update if all three are updated
together
- second we implicitly update the ctime for various changes
instead of relying on the ATTR_CTIME flag
- third we set the timestamps to the current time instead of the
arguments in the iattr structure in many cases.
This patch makes sure we update it in a consistent way:
- always transactional
- ctime is only updated if ATTR_CTIME is set or we do a size
update, which is a special case
- always to the times passed in from the caller instead of the
current time
The only non-size caller of xfs_setattr that doesn't come from
the VFS is updated to set ATTR_CTIME and pass in a valid ctime
value.
Reported-by: Eric Blake <ebb9@byu.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_acl.c | 3 -
fs/xfs/xfs_vnodeops.c | 93 ++++++++++++++++++---------------------------
2 files changed, 41 insertions(+), 55 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t
if (mode != inode->i_mode) {
struct iattr iattr;
- iattr.ia_valid = ATTR_MODE;
+ iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
iattr.ia_mode = mode;
+ iattr.ia_ctime = current_fs_time(inode->i_sb);
error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
}
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -69,7 +69,6 @@ xfs_setattr(
uint commit_flags=0;
uid_t uid=0, iuid=0;
gid_t gid=0, igid=0;
- int timeflags = 0;
struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
int need_iolock = 1;
@@ -134,16 +133,13 @@ xfs_setattr(
if (flags & XFS_ATTR_NOLOCK)
need_iolock = 0;
if (!(mask & ATTR_SIZE)) {
- if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
- (mp->m_flags & XFS_MOUNT_WSYNC)) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- commit_flags = 0;
- if ((code = xfs_trans_reserve(tp, 0,
- XFS_ICHANGE_LOG_RES(mp), 0,
- 0, 0))) {
- lock_flags = 0;
- goto error_return;
- }
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ commit_flags = 0;
+ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
+ 0, 0, 0);
+ if (code) {
+ lock_flags = 0;
+ goto error_return;
}
} else {
if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -294,15 +290,23 @@ xfs_setattr(
* or we are explicitly asked to change it. This handles
* the semantic difference between truncate() and ftruncate()
* as implemented in the VFS.
+ *
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
+ * is a special case where we need to update the times despite
+ * not having these flags set. For all other operations the
+ * VFS set these flags explicitly if it wants a timestamp
+ * update.
*/
- if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ if (iattr->ia_size != ip->i_size &&
+ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ iattr->ia_ctime = iattr->ia_mtime =
+ current_fs_time(inode->i_sb);
+ mask |= ATTR_CTIME | ATTR_MTIME;
+ }
if (iattr->ia_size > ip->i_size) {
ip->i_d.di_size = iattr->ia_size;
ip->i_size = iattr->ia_size;
- if (!(flags & XFS_ATTR_DMI))
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
} else if (iattr->ia_size <= ip->i_size ||
(iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -373,9 +377,6 @@ xfs_setattr(
ip->i_d.di_gid = gid;
inode->i_gid = gid;
}
-
- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
}
/*
@@ -392,51 +393,37 @@ xfs_setattr(
inode->i_mode &= S_IFMT;
inode->i_mode |= mode & ~S_IFMT;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
}
/*
* Change file access or modified times.
*/
- if (mask & (ATTR_ATIME|ATTR_MTIME)) {
- if (mask & ATTR_ATIME) {
- inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- ip->i_update_core = 1;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- timeflags &= ~XFS_ICHGTIME_MOD;
- timeflags |= XFS_ICHGTIME_CHG;
- }
- if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+ if (mask & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+ ip->i_update_core = 1;
}
-
- /*
- * Change file inode change time only if ATTR_CTIME set
- * AND we have been called by a DMI function.
- */
-
- if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
+ if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
ip->i_update_core = 1;
- timeflags &= ~XFS_ICHGTIME_CHG;
+ }
+ if (mask & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+ ip->i_update_core = 1;
}
/*
- * Send out timestamp changes that need to be set to the
- * current time. Not done when called by a DMI function.
+ * And finally, log the inode core if any attribute in it
+ * has been changed.
*/
- if (timeflags && !(flags & XFS_ATTR_DMI))
- xfs_ichgtime(ip, timeflags);
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
+ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(xs_ig_attrchg);
@@ -451,12 +438,10 @@ xfs_setattr(
* mix so this probably isn't worth the trouble to optimize.
*/
code = 0;
- if (tp) {
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(tp);
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
- code = xfs_trans_commit(tp, commit_flags);
- }
+ code = xfs_trans_commit(tp, commit_flags);
xfs_iunlock(ip, lock_flags);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [016/197] xfs: Dont flush stale inodes
[not found] <20100422191857.GA13268@kroah.com>
` (6 preceding siblings ...)
2010-04-22 19:07 ` [015/197] xfs: fix timestamp handling in xfs_setattr Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [017/197] xfs: Ensure we force all busy extents in range to disk Greg KH
` (9 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit 44e08c45cc14e6190a424be8d450070c8e508fad upstream
Because inodes remain in cache much longer than inode buffers do
under memory pressure, we can get the situation where we have
stale, dirty inodes being reclaimed but the backing storage has
been freed. Hence we should never, ever flush XFS_ISTALE inodes
to disk as there is no guarantee that the backing buffer is in
cache and still marked stale when the flush occurs.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_inode.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2877,10 +2877,14 @@ xfs_iflush(
mp = ip->i_mount;
/*
- * If the inode isn't dirty, then just release the inode
- * flush lock and do nothing.
+ * If the inode isn't dirty, then just release the inode flush lock and
+ * do nothing. Treat stale inodes the same; we cannot rely on the
+ * backing buffer remaining stale in cache for the remaining life of
+ * the stale inode and so xfs_itobp() below may give us a buffer that
+ * no longer contains inodes below. Doing this stale check here also
+ * avoids forcing the log on pinned, stale inodes.
*/
- if (xfs_inode_clean(ip)) {
+ if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
xfs_ifunlock(ip);
return 0;
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [017/197] xfs: Ensure we force all busy extents in range to disk
[not found] <20100422191857.GA13268@kroah.com>
` (7 preceding siblings ...)
2010-04-22 19:07 ` [016/197] xfs: Dont flush stale inodes Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [018/197] xfs: reclaim inodes under a write lock Greg KH
` (8 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit fd45e4784164d1017521086524e3442318c67370 upstream
When we search for and find a busy extent during allocation we
force the log out to ensure the extent free transaction is on
disk before the allocation transaction. The current implementation
has a subtle bug in it--it does not handle multiple overlapping
ranges.
That is, if we free lots of little extents into a single
contiguous extent, then allocate the contiguous extent, the busy
search code stops searching at the first extent it finds that
overlaps the allocated range. It then uses the commit LSN of the
transaction to force the log out to.
Unfortunately, the other busy ranges might have more recent
commit LSNs than the first busy extent that is found, and this
results in xfs_alloc_search_busy() returning before all the
extent free transactions are on disk for the range being
allocated. This can lead to potential metadata corruption or
stale data exposure after a crash because log replay won't replay
all the extent free transactions that cover the allocation range.
Modified-by: Alex Elder <aelder@sgi.com>
(Dropped the "found" argument from the xfs_alloc_busysearch trace
event.)
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_alloc.c | 52 +++++++++++++++++++++-------------------------------
1 file changed, 21 insertions(+), 31 deletions(-)
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
xfs_mount_t *mp;
xfs_perag_busy_t *bsy;
xfs_agblock_t uend, bend;
- xfs_lsn_t lsn;
+ xfs_lsn_t lsn = 0;
int cnt;
mp = tp->t_mountp;
spin_lock(&mp->m_perag[agno].pagb_lock);
- cnt = mp->m_perag[agno].pagb_count;
-
uend = bno + len - 1;
- /* search pagb_list for this slot, skipping open slots */
- for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
-
- /*
- * (start1,length1) within (start2, length2)
- */
- if (bsy->busy_tp != NULL) {
- bend = bsy->busy_start + bsy->busy_length - 1;
- if ((bno > bend) || (uend < bsy->busy_start)) {
- cnt--;
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy",
- "found1", agno, bno, len, tp);
- break;
- }
- }
- }
-
/*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
+ * search pagb_list for this slot, skipping open slots. We have to
+ * search the entire array as there may be multiple overlaps and
+ * we have to get the most recent LSN for the log force to push out
+ * all the transactions that span the range.
*/
- if (cnt) {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
- lsn = bsy->busy_tp->t_commit_lsn;
- spin_unlock(&mp->m_perag[agno].pagb_lock);
- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
- spin_unlock(&mp->m_perag[agno].pagb_lock);
+ for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
+ bsy = &mp->m_perag[agno].pagb_list[cnt];
+ if (!bsy->busy_tp)
+ continue;
+ bend = bsy->busy_start + bsy->busy_length - 1;
+ if (bno > bend || uend < bsy->busy_start)
+ continue;
+
+ /* (start1,length1) within (start2, length2) */
+ if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+ lsn = bsy->busy_tp->t_commit_lsn;
}
+ spin_unlock(&mp->m_perag[agno].pagb_lock);
+ TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found",
+ agno, bno, len, tp);
+ if (lsn)
+ xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [018/197] xfs: reclaim inodes under a write lock
[not found] <20100422191857.GA13268@kroah.com>
` (8 preceding siblings ...)
2010-04-22 19:07 ` [017/197] xfs: Ensure we force all busy extents in range to disk Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [019/197] xfs: Avoid inodes in reclaim when flushing from inode cache Greg KH
` (7 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit c8e20be020f234c8d492927a424a7d8bbefd5b5d upstream
Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of a patch
posted by Christoph Hellwig that avoids all the code duplication.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_sync.c | 154 ++++++++++++++++++-----------------------
fs/xfs/linux-2.6/xfs_sync.h | 2
fs/xfs/quota/xfs_qm_syscalls.c | 2
3 files changed, 71 insertions(+), 87 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -64,7 +64,6 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
- read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
@@ -73,7 +72,7 @@ xfs_inode_ag_lookup(
(void **)&ip, *first_index, 1, tag);
}
if (!nr_found)
- goto unlock;
+ return NULL;
/*
* Update the index for the next lookup. Catch overflows
@@ -83,13 +82,8 @@ xfs_inode_ag_lookup(
*/
*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- goto unlock;
-
+ return NULL;
return ip;
-
-unlock:
- read_unlock(&pag->pag_ici_lock);
- return NULL;
}
STATIC int
@@ -99,7 +93,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive)
{
struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index;
@@ -113,10 +108,20 @@ restart:
int error = 0;
xfs_inode_t *ip;
+ if (exclusive)
+ write_lock(&pag->pag_ici_lock);
+ else
+ read_lock(&pag->pag_ici_lock);
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
- if (!ip)
+ if (!ip) {
+ if (exclusive)
+ write_unlock(&pag->pag_ici_lock);
+ else
+ read_unlock(&pag->pag_ici_lock);
break;
+ }
+ /* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags);
if (error == EAGAIN) {
skipped++;
@@ -124,9 +129,8 @@ restart:
}
if (error)
last_error = error;
- /*
- * bail out if the filesystem is corrupted.
- */
+
+ /* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
@@ -147,7 +151,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive)
{
int error = 0;
int last_error = 0;
@@ -156,7 +161,8 @@ xfs_inode_ag_iterator(
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init)
continue;
- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+ exclusive);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
@@ -180,11 +186,7 @@ xfs_sync_inode_valid(
return EFSCORRUPTED;
}
- /*
- * If we can't get a reference on the inode, it must be in reclaim.
- * Leave it for the reclaim code to flush. Also avoid inodes that
- * haven't been fully initialised.
- */
+ /* If we can't get a reference on the inode, it must be in reclaim. */
if (!igrab(inode)) {
read_unlock(&pag->pag_ici_lock);
return ENOENT;
@@ -281,7 +283,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
if (error)
return XFS_ERROR(error);
@@ -303,7 +305,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
}
STATIC int
@@ -663,60 +665,6 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-STATIC int
-xfs_reclaim_inode(
- xfs_inode_t *ip,
- int sync_mode)
-{
- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
- /* The hash lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
- write_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- return -EAGAIN;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_put_perag(ip->i_mount, pag);
-
- /*
- * If the inode is still dirty, then flush it out. If the inode
- * is not in the AIL, then it will be OK to flush it delwri as
- * long as xfs_iflush() does not keep any references to the inode.
- * We leave that decision up to xfs_iflush() since it has the
- * knowledge of whether it's OK to simply do a delwri flush of
- * the inode or whether we need to wait until the inode is
- * pulled from the AIL.
- * We get the flush lock regardless, though, just to make sure
- * we don't free it while it is being flushed.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
-
- /*
- * In the case of a forced shutdown we rely on xfs_iflush() to
- * wait for the inode to be unpinned before returning an error.
- */
- if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
- /* synchronize with xfs_iflush_done */
- xfs_iflock(ip);
- xfs_ifunlock(ip);
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_ireclaim(ip);
- return 0;
-}
-
void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
@@ -759,19 +707,55 @@ __xfs_inode_clear_reclaim_tag(
}
STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
- int flags)
+ int sync_mode)
{
- /* ignore if already under reclaim */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
return 0;
}
- read_unlock(&pag->pag_ici_lock);
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
- return xfs_reclaim_inode(ip, flags);
+ /*
+ * If the inode is still dirty, then flush it out. If the inode
+ * is not in the AIL, then it will be OK to flush it delwri as
+ * long as xfs_iflush() does not keep any references to the inode.
+ * We leave that decision up to xfs_iflush() since it has the
+ * knowledge of whether it's OK to simply do a delwri flush of
+ * the inode or whether we need to wait until the inode is
+ * pulled from the AIL.
+ * We get the flush lock regardless, though, just to make sure
+ * we don't free it while it is being flushed.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
+
+ /*
+ * In the case of a forced shutdown we rely on xfs_iflush() to
+ * wait for the inode to be unpinned before returning an error.
+ */
+ if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+ /* synchronize with xfs_iflush_done */
+ xfs_iflock(ip);
+ xfs_ifunlock(ip);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_ireclaim(ip);
+ return 0;
}
int
@@ -779,6 +763,6 @@ xfs_reclaim_inodes(
xfs_mount_t *mp,
int mode)
{
- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
- XFS_ICI_RECLAIM_TAG);
+ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+ XFS_ICI_RECLAIM_TAG, 1);
}
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struc
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag);
+ int flags, int tag, int write_lock);
#endif
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}
/*------------------------------------------------------------------------*/
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [019/197] xfs: Avoid inodes in reclaim when flushing from inode cache
[not found] <20100422191857.GA13268@kroah.com>
` (9 preceding siblings ...)
2010-04-22 19:07 ` [018/197] xfs: reclaim inodes under a write lock Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [020/197] xfs: reclaim all inodes by background tree walks Greg KH
` (6 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit 018027be90a6946e8cf3f9b17b5582384f7ed117 upstream
The reclaim code will handle flushing of dirty inodes before reclaim
occurs, so avoid them when determining whether an inode is a
candidate for flushing to disk when walking the radix trees. This
is based on a test patch from Christoph Hellwig.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_sync.c | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -179,26 +179,31 @@ xfs_sync_inode_valid(
struct xfs_perag *pag)
{
struct inode *inode = VFS_I(ip);
+ int error = EFSCORRUPTED;
/* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- read_unlock(&pag->pag_ici_lock);
- return EFSCORRUPTED;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_unlock;
- /* If we can't get a reference on the inode, it must be in reclaim. */
- if (!igrab(inode)) {
- read_unlock(&pag->pag_ici_lock);
- return ENOENT;
- }
- read_unlock(&pag->pag_ici_lock);
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ error = ENOENT;
+ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock;
- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ goto out_unlock;
+
+ if (is_bad_inode(inode)) {
IRELE(ip);
- return ENOENT;
+ goto out_unlock;
}
- return 0;
+ /* inode is valid */
+ error = 0;
+out_unlock:
+ read_unlock(&pag->pag_ici_lock);
+ return error;
}
STATIC int
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [020/197] xfs: reclaim all inodes by background tree walks
[not found] <20100422191857.GA13268@kroah.com>
` (10 preceding siblings ...)
2010-04-22 19:07 ` [019/197] xfs: Avoid inodes in reclaim when flushing from inode cache Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [021/197] xfs: fix stale inode flush avoidance Greg KH
` (5 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit 57817c68229984818fea9e614d6f95249c3fb098 upstream
We cannot do direct inode reclaim without taking the flush lock to
ensure that we do not reclaim an inode under IO. We check the inode
is clean before doing direct reclaim, but this is not good enough
because the inode flush code marks the inode clean once it has
copied the in-core dirty state to the backing buffer.
It is the flush lock that determines whether the inode is still
under IO, even though it is marked clean, and the inode is still
required at IO completion so we can't reclaim it even though it is
clean in core. Hence the requirement that we need to take the flush
lock even on clean inodes because this guarantees that the inode
writeback IO has completed and it is safe to reclaim the inode.
With delayed write inode flushing, we could end up waiting a long
time on the flush lock even for a clean inode. The background
reclaim already handles this efficiently, so avoid all the problems
by killing the direct reclaim path altogether.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_super.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -953,16 +953,14 @@ xfs_fs_destroy_inode(
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise delay the flush operation.
+ * We always use background reclaim here because even if the
+ * inode is clean, it still may be under IO and hence we have
+ * to take the flush lock. The background reclaim path handles
+ * this more efficiently than we can here, so simply let background
+ * reclaim tear down all inodes.
*/
- if (!xfs_inode_clean(ip)) {
- xfs_inode_set_reclaim_tag(ip);
- return;
- }
-
out_reclaim:
- xfs_ireclaim(ip);
+ xfs_inode_set_reclaim_tag(ip);
}
/*
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [021/197] xfs: fix stale inode flush avoidance
[not found] <20100422191857.GA13268@kroah.com>
` (11 preceding siblings ...)
2010-04-22 19:07 ` [020/197] xfs: reclaim all inodes by background tree walks Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [022/197] xfs: xfs_swap_extents needs to handle dynamic fork offsets Greg KH
` (4 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit 4b6a46882cca8349e8942e2650c33b11bc571c92 upstream
When reclaiming stale inodes, we need to guarantee that inodes are
unpinned before returning with a "clean" status. If we don't we can
reclaim inodes that are pinned, leading to use after free in the
transaction subsystem as transactions complete.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_inode.c | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2878,13 +2878,9 @@ xfs_iflush(
/*
* If the inode isn't dirty, then just release the inode flush lock and
- * do nothing. Treat stale inodes the same; we cannot rely on the
- * backing buffer remaining stale in cache for the remaining life of
- * the stale inode and so xfs_itobp() below may give us a buffer that
- * no longer contains inodes below. Doing this stale check here also
- * avoids forcing the log on pinned, stale inodes.
+ * do nothing.
*/
- if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+ if (xfs_inode_clean(ip)) {
xfs_ifunlock(ip);
return 0;
}
@@ -2908,6 +2904,19 @@ xfs_iflush(
xfs_iunpin_wait(ip);
/*
+ * For stale inodes we cannot rely on the backing buffer remaining
+ * stale in cache for the remaining life of the stale inode and so
+ * xfs_itobp() below may give us a buffer that no longer contains
+ * inodes below. We have to check this after ensuring the inode is
+ * unpinned so that it is safe to reclaim the stale inode after the
+ * flush call.
+ */
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
+
+ /*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
* to disk, because the log record didn't make it to disk!
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [022/197] xfs: xfs_swap_extents needs to handle dynamic fork offsets
[not found] <20100422191857.GA13268@kroah.com>
` (12 preceding siblings ...)
2010-04-22 19:07 ` [021/197] xfs: fix stale inode flush avoidance Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [023/197] xfs: quota limit statvfs available blocks Greg KH
` (3 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit e09f98606dcc156de1146c209d45a0d6d5f51c3f upstream
When swapping extents, we can corrupt inodes by swapping data forks
that are in incompatible formats. This is caused by the two indoes
having different fork offsets due to the presence of an attribute
fork on an attr2 filesystem. xfs_fsr tries to be smart about
setting the fork offset, but the trick it plays only works on attr1
(old fixed format attribute fork) filesystems.
Changing the way xfs_fsr sets up the attribute fork will prevent
this situation from ever occurring, so in the kernel code we can get
by with a preventative fix - check that the data fork in the
defragmented inode is in a format valid for the inode it is being
swapped into. This will lead to files that will silently and
potentially repeatedly fail defragmentation, so issue a warning to
the log when this particular failure occurs to let us know that
xfs_fsr needs updating/fixing.
To help identify how to improve xfs_fsr to avoid this issue, add
trace points for the inodes being swapped so that we can determine
why the swap was rejected and to confirm that the code is making the
right decisions and modifications when swapping forks.
A further complication is even when the swap is allowed to proceed
when the fork offset is different between the two inodes then value
for the maximum number of extents the data fork can hold can be
wrong. Make sure these are also set correctly after the swap occurs.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_dfrag.c | 106 +++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 90 insertions(+), 16 deletions(-)
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -113,10 +113,82 @@ xfs_swapext(
return error;
}
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check root block of temp in btree form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+
+ /* Check root block of target in btree form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ return 0;
+}
+
int
xfs_swap_extents(
- xfs_inode_t *ip,
- xfs_inode_t *tip,
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
xfs_swapext_t *sxp)
{
xfs_mount_t *mp;
@@ -160,13 +232,6 @@ xfs_swap_extents(
goto out_unlock;
}
- /* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
if (VN_CACHED(VFS_I(tip)) != 0) {
xfs_inval_cached_trace(tip, 0, -1, 0, -1);
error = xfs_flushinval_pages(tip, 0, -1,
@@ -189,13 +254,12 @@ xfs_swap_extents(
goto out_unlock;
}
- /*
- * If the target has extended attributes, the tmp file
- * must also in order to ensure the correct data fork
- * format.
- */
- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
- error = XFS_ERROR(EINVAL);
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_fs_cmn_err(CE_NOTE, mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __FILE__, ip->i_ino);
goto out_unlock;
}
@@ -276,6 +340,16 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
+ * Fix the in-memory data fork values that are dependent on the fork
+ * offset in the inode. We can't assume they remain the same as attr2
+ * has dynamic fork offsets.
+ */
+ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [023/197] xfs: quota limit statvfs available blocks
[not found] <20100422191857.GA13268@kroah.com>
` (13 preceding siblings ...)
2010-04-22 19:07 ` [022/197] xfs: xfs_swap_extents needs to handle dynamic fork offsets Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [024/197] xfs: dont hold onto reserved blocks on remount, ro Greg KH
` (2 subsequent siblings)
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, Alex Elder, akpm, xfs, torvalds, Christoph Hellwig,
alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@infradead.org>
commit 9b00f30762fe9f914eb6e03057a616ed63a4e8ca upstream
A "df" run on an NFS client of an exported XFS file system reports
the wrong information for "available" blocks. When a block quota is
enforced, the amount reported as free is limited by the quota, but
the amount reported available is not (and should be).
Reported-by: Guk-Bong, Kwon <gbkwon@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/quota/xfs_qm_bhv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
be64_to_cpu(dp->d_blk_hardlimit);
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
- statp->f_bfree =
+ statp->f_bfree = statp->f_bavail =
(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
(statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [024/197] xfs: dont hold onto reserved blocks on remount, ro
[not found] <20100422191857.GA13268@kroah.com>
` (14 preceding siblings ...)
2010-04-22 19:07 ` [023/197] xfs: quota limit statvfs available blocks Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [025/197] xfs: remove invalid barrier optimization from xfs_fsync Greg KH
2010-04-22 19:07 ` [026/197] xfs: Non-blocking inode locking in IO completion Greg KH
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit cbe132a8bdcff0f9afd9060948fb50597c7400b8 upstream
If we hold onto reserved blocks when doing a remount,ro we end
up writing the blocks used count to disk that includes the reserved
blocks. Reserved blocks are not actually used, so this results in
the values in the superblock being incorrect.
Hence if we run xfs_check or xfs_repair -n while the filesystem is
mounted remount,ro we end up with an inconsistent filesystem being
reported. Also, running xfs_copy on the remount,ro filesystem will
result in an inconsistent image being generated.
To fix this, unreserve the blocks when doing the remount,ro, and
reserved them again on remount,rw. This way a remount,ro filesystem
will appear consistent on disk to all utilities.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++
fs/xfs/xfs_mount.h | 1 +
2 files changed, 29 insertions(+)
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1323,6 +1323,8 @@ xfs_fs_remount(
/* ro -> rw */
if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ __uint64_t resblks;
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_mountfs_check_barriers(mp);
@@ -1340,11 +1342,37 @@ xfs_fs_remount(
}
mp->m_update_flags = 0;
}
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed
+ * value if it is non-zero, otherwise go with the default.
+ */
+ if (mp->m_resblks_save) {
+ resblks = mp->m_resblks_save;
+ mp->m_resblks_save = 0;
+ } else {
+ resblks = mp->m_sb.sb_dblocks;
+ do_div(resblks, 20);
+ resblks = min_t(__uint64_t, resblks, 1024);
+ }
+ xfs_reserve_blocks(mp, &resblks, NULL);
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ /*
+ * After we have synced the data but before we sync the
+ * metadata, we need to free up the reserve block pool so that
+ * the used block count in the superblock on disk is correct at
+ * the end of the remount. Stash the current reserve pool size
+ * so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+ __uint64_t resblks = 0;
+
xfs_quiesce_data(mp);
+ mp->m_resblks_save = mp->m_resblks;
+ xfs_reserve_blocks(mp, &resblks, NULL);
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -209,6 +209,7 @@ typedef struct xfs_mount {
__uint64_t m_maxioffset; /* maximum inode offset */
__uint64_t m_resblks; /* total reserved blocks */
__uint64_t m_resblks_avail;/* available reserved blocks */
+ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [025/197] xfs: remove invalid barrier optimization from xfs_fsync
[not found] <20100422191857.GA13268@kroah.com>
` (15 preceding siblings ...)
2010-04-22 19:07 ` [024/197] xfs: dont hold onto reserved blocks on remount, ro Greg KH
@ 2010-04-22 19:07 ` Greg KH
2010-04-22 19:07 ` [026/197] xfs: Non-blocking inode locking in IO completion Greg KH
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable
Cc: stable-review, akpm, xfs, torvalds, Christoph Hellwig, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Christoph Hellwig <hch@lst.de>
commit e8b217e7530c6a073ac69f1c85b922d93fdf5647 upstream
Date: Tue, 2 Feb 2010 10:16:26 +1100
We always need to flush the disk write cache and can't skip it just because
the no inode attributes have changed.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/xfs_vnodeops.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -597,7 +597,7 @@ xfs_fsync(
{
xfs_trans_t *tp;
int error = 0;
- int log_flushed = 0, changed = 1;
+ int log_flushed = 0;
xfs_itrace_entry(ip);
@@ -627,19 +627,11 @@ xfs_fsync(
* disk yet, the inode will be still be pinned. If it is,
* force the log.
*/
-
xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
if (xfs_ipincount(ip)) {
error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
XFS_LOG_FORCE | XFS_LOG_SYNC,
&log_flushed);
- } else {
- /*
- * If the inode is not pinned and nothing has changed
- * we don't need to flush the cache.
- */
- changed = 0;
}
} else {
/*
@@ -674,7 +666,7 @@ xfs_fsync(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
/*
* If the log write didn't issue an ordered tag we need
* to flush the disk cache for the data device now.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread* [026/197] xfs: Non-blocking inode locking in IO completion
[not found] <20100422191857.GA13268@kroah.com>
` (16 preceding siblings ...)
2010-04-22 19:07 ` [025/197] xfs: remove invalid barrier optimization from xfs_fsync Greg KH
@ 2010-04-22 19:07 ` Greg KH
17 siblings, 0 replies; 18+ messages in thread
From: Greg KH @ 2010-04-22 19:07 UTC (permalink / raw)
To: linux-kernel, stable; +Cc: xfs, Alex Elder, akpm, torvalds, stable-review, alan
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Dave Chinner <david@fromorbit.com>
commit 77d7a0c2eeb285c9069e15396703d0cb9690ac50 upstream
The introduction of barriers to loop devices has created a new IO
order completion dependency that XFS does not handle. The loop
device implements barriers using fsync and so turns a log IO in the
XFS filesystem on the loop device into a data IO in the backing
filesystem. That is, the completion of log IOs in the loop
filesystem are now dependent on completion of data IO in the backing
filesystem.
This can cause deadlocks when a flush daemon issues a log force with
an inode locked because the IO completion of IO on the inode is
blocked by the inode lock. This in turn prevents further data IO
completion from occuring on all XFS filesystems on that CPU (due to
the shared nature of the completion queues). This then prevents the
log IO from completing because the log is waiting for data IO
completion as well.
The fix for this new completion order dependency issue is to make
the IO completion inode locking non-blocking. If the inode lock
can't be grabbed, simply requeue the IO completion back to the work
queue so that it can be processed later. This prevents the
completion queue from being blocked and allows data IO completion on
other inodes to proceed, hence avoiding completion order dependent
deadlocks.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/xfs/linux-2.6/xfs_aops.c | 118 ++++++++++++++++++++++++++++++--------------
1 file changed, 82 insertions(+), 36 deletions(-)
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -204,14 +204,17 @@ xfs_ioend_new_eof(
}
/*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size. If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated. If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
*/
-
-STATIC void
+STATIC int
xfs_setfilesize(
xfs_ioend_t *ioend)
{
@@ -222,9 +225,11 @@ xfs_setfilesize(
ASSERT(ioend->io_type != IOMAP_READ);
if (unlikely(ioend->io_error))
- return;
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_ioend_new_eof(ioend);
if (isize) {
ip->i_d.di_size = isize;
@@ -232,6 +237,28 @@ xfs_setfilesize(
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+ xfs_ioend_t *ioend,
+ int wait)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct workqueue_struct *wq;
+
+ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ xfsconvertd_workqueue : xfsdatad_workqueue;
+ queue_work(wq, &ioend->io_work);
+ if (wait)
+ flush_workqueue(wq);
+ }
}
/*
@@ -243,9 +270,23 @@ xfs_end_bio_delalloc(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -257,9 +298,23 @@ xfs_end_bio_written(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -279,13 +334,25 @@ xfs_end_bio_unwritten(
size_t size = ioend->io_size;
if (likely(!ioend->io_error)) {
+ int error;
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- int error;
error = xfs_iomap_write_unwritten(ip, offset, size);
if (error)
ioend->io_error = error;
}
- xfs_setfilesize(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ return;
+ }
}
xfs_destroy_ioend(ioend);
}
@@ -304,27 +371,6 @@ xfs_end_bio_read(
}
/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq = xfsdatad_workqueue;
- if (ioend->io_work.func == xfs_end_bio_unwritten)
- wq = xfsconvertd_workqueue;
-
- queue_work(wq, &ioend->io_work);
- if (wait)
- flush_workqueue(wq);
- }
-}
-
-/*
* Allocate and initialise an IO completion structure.
* We need to track unwritten extent write completion here initially.
* We'll need to extend this for updating the ondisk inode size later
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 18+ messages in thread