linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: stable@vger.kernel.org
Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
Subject: [PATCH 02/27] xfs: use per-AG reservations for the finobt
Date: Sat,  1 Apr 2017 08:40:01 +0200	[thread overview]
Message-ID: <20170401064026.5783-3-hch@lst.de> (raw)
In-Reply-To: <20170401064026.5783-1-hch@lst.de>

commit 76d771b4cbe33c581bd6ca2710c120be51172440 upstream.

Currently we try to rely on the global reserved block pool for block
allocations for the free inode btree, but I have customer reports
(fairly complex workload, need to find an easier reproducer) where that
is not enough as the AG where we free an inode that requires a new
finobt block is entirely full.  This causes us to cancel a dirty
transaction and thus a file system shutdown.

I think the right way to guard against this is to treat the finot the same
way as the refcount btree and have a per-AG reservations for the possible
worst case size of it, and the patch below implements that.

Note that this could increase mount times with large finobt trees.  In
an ideal world we would have added a field for the number of finobt
fields to the AGI, similar to what we did for the refcount blocks.
We should do add it next time we rev the AGI or AGF format by adding
new fields.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_ag_resv.c      | 47 ++++++++++++++++++---
 fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_ialloc_btree.h |  3 ++
 fs/xfs/xfs_inode.c               | 23 +++++-----
 fs/xfs/xfs_mount.h               |  1 +
 5 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 94234bff40dc..33db69be4832 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
 	if (error) {
 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 				error, _RET_IP_);
+		xfs_warn(mp,
+"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
+				pag->pag_agno);
 		return error;
 	}
 
@@ -228,6 +232,8 @@ int
 xfs_ag_resv_init(
 	struct xfs_perag		*pag)
 {
+	struct xfs_mount		*mp = pag->pag_mount;
+	xfs_agnumber_t			agno = pag->pag_agno;
 	xfs_extlen_t			ask;
 	xfs_extlen_t			used;
 	int				error = 0;
@@ -236,23 +242,45 @@ xfs_ag_resv_init(
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
-				pag->pag_agno, &ask, &used);
+		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
-		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
-				ask, used);
+		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error) {
+			/*
+			 * Because we didn't have per-AG reservations when the
+			 * finobt feature was added we might not be able to
+			 * reserve all needed blocks.  Warn and fall back to the
+			 * old and potentially buggy code in that case, but
+			 * ensure we do have the reservation for the refcountbt.
+			 */
+			ask = used = 0;
+
+			mp->m_inotbt_nores = true;
+
+			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
+					&used);
+			if (error)
+				goto out;
+
+			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+					ask, used);
+			if (error)
+				goto out;
+		}
 	}
 
 	/* Create the AGFL metadata reservation */
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
-				&ask, &used);
+		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
@@ -261,9 +289,16 @@ xfs_ag_resv_init(
 			goto out;
 	}
 
+#ifdef DEBUG
+	/* need to read in the AGF for the ASSERT below to work */
+	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
+	if (error)
+		return error;
+
 	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
 	       pag->pagf_freeblks + pag->pagf_flcount);
+#endif
 out:
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 6c6b95947e71..b9c351ff0422 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
 }
 
 STATIC int
-xfs_inobt_alloc_block(
+__xfs_inobt_alloc_block(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*start,
 	union xfs_btree_ptr	*new,
-	int			*stat)
+	int			*stat,
+	enum xfs_ag_resv_type	resv)
 {
 	xfs_alloc_arg_t		args;		/* block allocation args */
 	int			error;		/* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
 	args.maxlen = 1;
 	args.prod = 1;
 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.resv = resv;
 
 	error = xfs_alloc_vextent(&args);
 	if (error) {
@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
 }
 
 STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat,
+			XFS_AG_RESV_METADATA);
+}
+
+STATIC int
 xfs_inobt_free_block(
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_finobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
+	.alloc_block		= xfs_finobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
 	.get_maxrecs		= xfs_inobt_get_maxrecs,
@@ -478,3 +501,64 @@ xfs_inobt_rec_check_count(
 	return 0;
 }
 #endif	/* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_inobt_mxr[0] == 0)
+		return 0;
+
+	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+				XFS_INODES_PER_CHUNK);
+}
+
+static int
+xfs_inobt_count_blocks(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum,
+	xfs_extlen_t		*tree_blocks)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
+	error = xfs_btree_count_blocks(cur, tree_blocks);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	return error;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	xfs_extlen_t		tree_len = 0;
+	int			error;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
+	if (error)
+		return error;
+
+	*ask += xfs_inobt_max_size(mp);
+	*used += tree_len;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453217ce..aa81e2e63f3f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
 #define xfs_inobt_rec_check_count(mp, rec)	0
 #endif	/* DEBUG */
 
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 512ff13ed66a..a1c7e138dbca 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1801,22 +1801,23 @@ xfs_inactive_ifree(
 	int			error;
 
 	/*
-	 * The ifree transaction might need to allocate blocks for record
-	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
-	 * allow ifree to dip into the reserved block pool if necessary.
-	 *
-	 * Freeing large sets of inodes generally means freeing inode chunks,
-	 * directory and file data blocks, so this should be relatively safe.
-	 * Only under severe circumstances should it be possible to free enough
-	 * inodes to exhaust the reserve block pool via finobt expansion while
-	 * at the same time not creating free space in the filesystem.
+	 * We try to use a per-AG reservation for any block needed by the finobt
+	 * tree, but as the finobt feature predates the per-AG reservation
+	 * support a degraded file system might not have enough space for the
+	 * reservation at mount time.  In that case try to dip into the reserved
+	 * pool and pray.
 	 *
 	 * Send a warning if the reservation does happen to fail, as the inode
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
-			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (unlikely(mp->m_inotbt_nores)) {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+				&tp);
+	} else {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+	}
 	if (error) {
 		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 819b80b15bfb..1bf878b0492c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
+	bool			m_inotbt_nores; /* no per-AG finobt resv. */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
 	int			m_ialloc_min_blks;/* min blocks in sparse inode
-- 
2.11.0


  parent reply	other threads:[~2017-04-01  6:40 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-01  6:39 4.9-stable updates for XFS Christoph Hellwig
2017-04-01  6:40 ` [PATCH 01/27] xfs: only update mount/resv fields on success in __xfs_ag_resv_init Christoph Hellwig
2017-04-01  6:40 ` Christoph Hellwig [this message]
2017-04-01  6:40 ` [PATCH 03/27] xfs: pull up iolock from xfs_free_eofblocks() Christoph Hellwig
2017-04-01  6:40 ` [PATCH 04/27] xfs: sync eofblocks scans under iolock are livelock prone Christoph Hellwig
2017-04-01  6:40 ` [PATCH 05/27] xfs: fix eofblocks race with file extending async dio writes Christoph Hellwig
2017-04-01  6:40 ` [PATCH 06/27] xfs: fix toctou race when locking an inode to access the data map Christoph Hellwig
2017-04-01  6:40 ` [PATCH 07/27] xfs: fail _dir_open when readahead fails Christoph Hellwig
2017-04-01  6:40 ` [PATCH 08/27] xfs: filter out obviously bad btree pointers Christoph Hellwig
2017-04-01  6:40 ` [PATCH 09/27] xfs: check for obviously bad level values in the bmbt root Christoph Hellwig
2017-04-01  6:40 ` [PATCH 10/27] xfs: verify free block header fields Christoph Hellwig
2017-04-01  6:40 ` [PATCH 11/27] xfs: allow unwritten extents in the CoW fork Christoph Hellwig
2017-04-01  6:40 ` [PATCH 12/27] xfs: mark speculative prealloc CoW fork extents unwritten Christoph Hellwig
2017-04-01  6:40 ` [PATCH 13/27] xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t Christoph Hellwig
2017-04-01  6:40 ` [PATCH 14/27] xfs: reject all unaligned direct writes to reflinked files Christoph Hellwig
2017-04-01  6:40 ` [PATCH 15/27] xfs: update ctime and mtime on clone destinatation inodes Christoph Hellwig
2017-04-01  6:40 ` [PATCH 16/27] xfs: don't fail xfs_extent_busy allocation Christoph Hellwig
2017-04-01  6:40 ` [PATCH 17/27] xfs: handle indlen shortage on delalloc extent merge Christoph Hellwig
2017-04-01  6:40 ` [PATCH 18/27] xfs: split indlen reservations fairly when under reserved Christoph Hellwig
2017-04-01  6:40 ` [PATCH 19/27] xfs: fix uninitialized variable in _reflink_convert_cow Christoph Hellwig
2017-04-01  6:40 ` [PATCH 20/27] xfs: don't reserve blocks for right shift transactions Christoph Hellwig
2017-04-01  6:40 ` [PATCH 21/27] xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment Christoph Hellwig
2017-04-01  6:40 ` [PATCH 22/27] xfs: tune down agno asserts in the bmap code Christoph Hellwig
2017-04-01  6:40 ` [PATCH 23/27] xfs: only reclaim unwritten COW extents periodically Christoph Hellwig
2017-04-01  6:40 ` [PATCH 24/27] xfs: fix and streamline error handling in xfs_end_io Christoph Hellwig
2017-04-01  6:40 ` [PATCH 25/27] xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask Christoph Hellwig
2017-04-01  6:40 ` [PATCH 26/27] xfs: use iomap new flag for newly allocated delalloc blocks Christoph Hellwig
2017-04-01  6:40 ` [PATCH 27/27] xfs: try any AG when allocating the first btree block when reflinking Christoph Hellwig
2017-04-01 17:32 ` 4.9-stable updates for XFS Greg KH
  -- strict thread matches above, loose matches on Subject: below --
2017-03-27  8:48 Christoph Hellwig
2017-03-27  8:48 ` [PATCH 02/27] xfs: use per-AG reservations for the finobt Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170401064026.5783-3-hch@lst.de \
    --to=hch@lst.de \
    --cc=darrick.wong@oracle.com \
    --cc=linux-xfs@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).