[PATCH 4.9 33/51] xfs: bump up reserved blocks in xfs_alloc_set

linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 4.9 33/51] xfs: bump up reserved blocks in xfs_alloc_set_aside
       [not found] <20170202183345.067336143@linuxfoundation.org>
@ 2017-02-02 18:37 ` Greg Kroah-Hartman
  2017-02-02 18:37 ` [PATCH 4.9 34/51] xfs: fix bogus minleft manipulations Greg Kroah-Hartman
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Brian Foster

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 5149fd327f16e393c1d04fa5325ab072c32472bf upstream.

Setting aside 4 blocks globally for bmbt splits isn't all that useful,
as different threads can allocate space in parallel.  Bump it to 4
blocks per AG to allow each thread that is currently doing an
allocation to dip into it separately.  Without that we may not have
enough reserved blocks if there are enough parallel transactions
in an almost out of space file system that all run into bmap btree
splits.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_alloc.c |    5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -95,10 +95,7 @@ unsigned int
 xfs_alloc_set_aside(
 	struct xfs_mount	*mp)
 {
-	unsigned int		blocks;
-
-	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-	return blocks;
+	return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
 }
 
 /*



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 34/51] xfs: fix bogus minleft manipulations
       [not found] <20170202183345.067336143@linuxfoundation.org>
  2017-02-02 18:37 ` [PATCH 4.9 33/51] xfs: bump up reserved blocks in xfs_alloc_set_aside Greg Kroah-Hartman
@ 2017-02-02 18:37 ` Greg Kroah-Hartman
  2017-02-02 18:37 ` [PATCH 4.9 35/51] xfs: adjust allocation length in xfs_alloc_space_available Greg Kroah-Hartman
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Brian Foster

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 255c516278175a6dc7037d1406307f35237d8688 upstream.

We can't just set minleft to 0 when we're low on space - that's exactly
what we need minleft for: to protect space in the AG for btree block
allocations when we are low on free space.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_alloc.c      |   24 +++++++-----------------
 fs/xfs/libxfs/xfs_bmap.c       |    3 ---
 fs/xfs/libxfs/xfs_bmap_btree.c |    3 +--
 3 files changed, 8 insertions(+), 22 deletions(-)

--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2635,12 +2635,10 @@ xfs_alloc_vextent(
 	xfs_agblock_t	agsize;	/* allocation group size */
 	int		error;
 	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	xfs_extlen_t	minleft;/* minimum left value, temp copy */
 	xfs_mount_t	*mp;	/* mount structure pointer */
 	xfs_agnumber_t	sagno;	/* starting allocation group number */
 	xfs_alloctype_t	type;	/* input allocation type */
 	int		bump_rotor = 0;
-	int		no_min = 0;
 	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
 
 	mp = args->mp;
@@ -2669,7 +2667,6 @@ xfs_alloc_vextent(
 		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
-	minleft = args->minleft;
 
 	switch (type) {
 	case XFS_ALLOCTYPE_THIS_AG:
@@ -2680,9 +2677,7 @@ xfs_alloc_vextent(
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
 		args->pag = xfs_perag_get(mp, args->agno);
-		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
-		args->minleft = minleft;
 		if (error) {
 			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
@@ -2747,9 +2742,7 @@ xfs_alloc_vextent(
 		 */
 		for (;;) {
 			args->pag = xfs_perag_get(mp, args->agno);
-			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
-			args->minleft = minleft;
 			if (error) {
 				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
@@ -2789,20 +2782,17 @@ xfs_alloc_vextent(
 			 * or switch to non-trylock mode.
 			 */
 			if (args->agno == sagno) {
-				if (no_min == 1) {
+				if (flags == 0) {
 					args->agbno = NULLAGBLOCK;
 					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
-				if (flags == 0) {
-					no_min = 1;
-				} else {
-					flags = 0;
-					if (type == XFS_ALLOCTYPE_START_BNO) {
-						args->agbno = XFS_FSB_TO_AGBNO(mp,
-							args->fsbno);
-						args->type = XFS_ALLOCTYPE_NEAR_BNO;
-					}
+
+				flags = 0;
+				if (type == XFS_ALLOCTYPE_START_BNO) {
+					args->agbno = XFS_FSB_TO_AGBNO(mp,
+						args->fsbno);
+					args->type = XFS_ALLOCTYPE_NEAR_BNO;
 				}
 			}
 			xfs_perag_put(args->pag);
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3903,7 +3903,6 @@ xfs_bmap_btalloc(
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
 		args.total = ap->minlen;
-		args.minleft = 0;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 		ap->dfops->dop_low = true;
@@ -4437,8 +4436,6 @@ xfs_bmapi_allocate(
 	if (error)
 		return error;
 
-	if (bma->dfops->dop_low)
-		bma->minleft = 0;
 	if (bma->cur)
 		bma->cur->bc_private.b.firstblock = *bma->firstblock;
 	if (bma->blkno == NULLFSBLOCK)
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -502,12 +502,11 @@ try_another_ag:
 	if (args.fsbno == NULLFSBLOCK && args.minleft) {
 		/*
 		 * Could not find an AG with enough free space to satisfy
-		 * a full btree split.  Try again without minleft and if
+		 * a full btree split.  Try again and if
 		 * successful activate the lowspace algorithm.
 		 */
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.minleft = 0;
 		error = xfs_alloc_vextent(&args);
 		if (error)
 			goto error0;



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 35/51] xfs: adjust allocation length in xfs_alloc_space_available
       [not found] <20170202183345.067336143@linuxfoundation.org>
  2017-02-02 18:37 ` [PATCH 4.9 33/51] xfs: bump up reserved blocks in xfs_alloc_set_aside Greg Kroah-Hartman
  2017-02-02 18:37 ` [PATCH 4.9 34/51] xfs: fix bogus minleft manipulations Greg Kroah-Hartman
@ 2017-02-02 18:37 ` Greg Kroah-Hartman
  2017-02-02 18:37 ` [PATCH 4.9 36/51] xfs: dont rely on ->total " Greg Kroah-Hartman
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Brian Foster

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 54fee133ad59c87ab01dd84ab3e9397134b32acb upstream.

We must decide in xfs_alloc_fix_freelist whether an allocation from
a given AG is possible or not based on the available space, and
should not fail the allocation past that point on a healthy file
system.

But currently we have two additional places that second-guess
xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the
maxlen parameter to remove the reservation before doing the
allocation (but ignores the various minimum freespace requirements),
and xfs_alloc_fix_minleft tries to fix up the allocated length
after we've found an extent, but ignores the reservations and also
doesn't take the AGFL into account (and thus fails allocations
for not matching minlen in some cases).

Remove all these later fixups and just correct the maxlen argument
inside xfs_alloc_fix_freelist once we have the AGF buffer locked.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_alloc.c |   81 +++++++++-------------------------------------
 fs/xfs/libxfs/xfs_alloc.h |    2 -
 2 files changed, 18 insertions(+), 65 deletions(-)

--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -362,36 +362,12 @@ xfs_alloc_fix_len(
 		return;
 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
 	ASSERT(rlen % args->prod == args->mod);
+	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+		rlen + args->minleft);
 	args->len = rlen;
 }
 
 /*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_agf_t	*agf;		/* a.g. freelist header */
-	int		diff;		/* free space difference */
-
-	if (args->minleft == 0)
-		return 1;
-	agf = XFS_BUF_TO_AGF(args->agbp);
-	diff = be32_to_cpu(agf->agf_freeblks)
-		- args->len - args->minleft;
-	if (diff >= 0)
-		return 1;
-	args->len += diff;		/* shrink the allocated space */
-	/* casts to (int) catch length underflows */
-	if ((int)args->len >= (int)args->minlen)
-		return 1;
-	args->agbno = NULLAGBLOCK;
-	return 0;
-}
-
-/*
  * Update the two btrees, logically removing from freespace the extent
  * starting at rbno, rlen blocks.  The extent is contained within the
  * actual (current) free extent fbno for flen blocks.
@@ -686,8 +662,6 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
-	xfs_extlen_t	reservation;
-	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
@@ -696,20 +670,6 @@ xfs_alloc_ag_vextent(
 	ASSERT(args->alignment > 0);
 
 	/*
-	 * Clamp maxlen to the amount of free space minus any reservations
-	 * that have been made.
-	 */
-	oldmax = args->maxlen;
-	reservation = xfs_ag_resv_needed(args->pag, args->resv);
-	if (args->maxlen > args->pag->pagf_freeblks - reservation)
-		args->maxlen = args->pag->pagf_freeblks - reservation;
-	if (args->maxlen == 0) {
-		args->agbno = NULLAGBLOCK;
-		args->maxlen = oldmax;
-		return 0;
-	}
-
-	/*
 	 * Branch to correct routine based on the type.
 	 */
 	args->wasfromfl = 0;
@@ -728,8 +688,6 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
-	args->maxlen = oldmax;
-
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
@@ -838,9 +796,6 @@ xfs_alloc_ag_vextent_exact(
 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
 						- args->agbno;
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args))
-		goto not_found;
-
 	ASSERT(args->agbno + args->len <= tend);
 
 	/*
@@ -1146,12 +1101,7 @@ restart:
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
-		if (!xfs_alloc_fix_minleft(args)) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_near_nominleft(args);
-			return 0;
-		}
-		blen = args->len;
+
 		/*
 		 * We are allocating starting at bnew for blen blocks.
 		 */
@@ -1343,12 +1293,6 @@ restart:
 	 */
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		trace_xfs_alloc_near_nominleft(args);
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
 	rlen = args->len;
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
 				     args->datatype, ltbnoa, ltlena, &ltnew);
@@ -1550,8 +1494,6 @@ restart:
 	}
 	xfs_alloc_fix_len(args);
 
-	if (!xfs_alloc_fix_minleft(args))
-		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
 	/*
@@ -2070,10 +2012,20 @@ xfs_alloc_space_available(
 
 	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-			  reservation - min_free - args->total);
-	if (available < (int)args->minleft || available <= 0)
+			  reservation - min_free - args->minleft);
+	if (available < (int)args->total)
 		return false;
 
+	/*
+	 * Clamp maxlen to the amount of free space available for the actual
+	 * extent allocation.
+	 */
+	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+		args->maxlen = available;
+		ASSERT(args->maxlen > 0);
+		ASSERT(args->maxlen >= args->minlen);
+	}
+
 	return true;
 }
 
@@ -2119,7 +2071,8 @@ xfs_alloc_fix_freelist(
 	}
 
 	need = xfs_alloc_min_freelist(mp, pag);
-	if (!xfs_alloc_space_available(args, need, flags))
+	if (!xfs_alloc_space_available(args, need, flags |
+			XFS_ALLOC_FLAG_CHECK))
 		goto out_agbp_relse;
 
 	/*
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t;
 #define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 #define	XFS_ALLOC_FLAG_NORMAP	0x00000004  /* don't modify the rmapbt */
 #define	XFS_ALLOC_FLAG_NOSHRINK	0x00000008  /* don't shrink the freelist */
-
+#define	XFS_ALLOC_FLAG_CHECK	0x00000010  /* test only, don't modify args */
 
 /*
  * Argument structure for xfs_alloc routines.



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 36/51] xfs: dont rely on ->total in xfs_alloc_space_available
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (2 preceding siblings ...)
  2017-02-02 18:37 ` [PATCH 4.9 35/51] xfs: adjust allocation length in xfs_alloc_space_available Greg Kroah-Hartman
@ 2017-02-02 18:37 ` Greg Kroah-Hartman
  2017-02-02 18:37 ` [PATCH 4.9 37/51] xfs: dont print warnings when xfs_log_force fails Greg Kroah-Hartman
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Brian Foster

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 12ef830198b0d71668eb9b59f9ba69d32951a48a upstream.

->total is a bit of an odd parameter passed down to the low-level
allocator all the way from the high-level callers.  It's supposed to
contain the maximum number of blocks to be allocated for the whole
transaction [1].

But in xfs_iomap_write_allocate we only convert existing delayed
allocations and thus only have a minimal block reservation for the
current transaction, so xfs_alloc_space_available can't use it for
the allocation decisions.  Use the maximum of args->total and the
calculated block requirement to make a decision.  We probably should
get rid of args->total eventually and instead apply ->minleft more
broadly, but that will require some extensive changes all over.

[1] which creates lots of confusion as most callers don't decrement it
once doing a first allocation.  But that's for a separate series.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_alloc.c |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1995,7 +1995,7 @@ xfs_alloc_space_available(
 	int			flags)
 {
 	struct xfs_perag	*pag = args->pag;
-	xfs_extlen_t		longest;
+	xfs_extlen_t		alloc_len, longest;
 	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
 
@@ -2005,15 +2005,16 @@ xfs_alloc_space_available(
 	reservation = xfs_ag_resv_needed(pag, args->resv);
 
 	/* do we have enough contiguous free space for the allocation? */
+	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
 	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
 			reservation);
-	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+	if (longest < alloc_len)
 		return false;
 
 	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
 			  reservation - min_free - args->minleft);
-	if (available < (int)args->total)
+	if (available < (int)max(args->total, alloc_len))
 		return false;
 
 	/*



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 37/51] xfs: dont print warnings when xfs_log_force fails
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (3 preceding siblings ...)
  2017-02-02 18:37 ` [PATCH 4.9 36/51] xfs: dont rely on ->total " Greg Kroah-Hartman
@ 2017-02-02 18:37 ` Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 46/51] xfs: fix COW writeback race Greg Kroah-Hartman
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Carlos Maiolino

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 84a4620cfe97c9d57e39b2369bfb77faff55063d upstream.

There are only two reasons for xfs_log_force / xfs_log_force_lsn to fail:
one is an I/O error, for which xlog_bdstrat already logs a warning, and
the second is an already shutdown log due to previous I/O errors.  In
the latter case we'll already have a previous indication for the actual
error, but the large stream of misleading warnings from xfs_log_force
will probably scroll it out of the message buffer.

Simply removing the warnings thus makes the XFS log reporting significantly
better.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_log.c |   12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3324,12 +3324,8 @@ xfs_log_force(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int	error;
-
 	trace_xfs_log_force(mp, 0, _RET_IP_);
-	error = _xfs_log_force(mp, flags, NULL);
-	if (error)
-		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+	_xfs_log_force(mp, flags, NULL);
 }
 
 /*
@@ -3473,12 +3469,8 @@ xfs_log_force_lsn(
 	xfs_lsn_t	lsn,
 	uint		flags)
 {
-	int	error;
-
 	trace_xfs_log_force(mp, lsn, _RET_IP_);
-	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
-	if (error)
-		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+	_xfs_log_force_lsn(mp, lsn, flags, NULL);
 }
 
 /*



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 46/51] xfs: fix COW writeback race
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (4 preceding siblings ...)
  2017-02-02 18:37 ` [PATCH 4.9 37/51] xfs: dont print warnings when xfs_log_force fails Greg Kroah-Hartman
@ 2017-02-02 18:38 ` Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 47/51] xfs: verify dirblocklog correctly Greg Kroah-Hartman
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Brian Foster

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit d2b3964a0780d2d2994eba57f950d6c9fe489ed8 upstream.

Due to the way xfs_iomap_write_allocate tries to convert the whole
found extent from delalloc to real space we can run into a race
condition with multiple threads doing writes to the same extent.
For the non-COW case that is harmless as the only thing that can happen
is that we call xfs_bmapi_write on an extent that has already been
converted to a real allocation.  For COW writes where we move the extent
from the COW to the data fork after I/O completion the race is, however,
not quite as harmless.  In the worst case we are now calling
xfs_bmapi_write on a region that contains a hole in the COW fork, which
will trip up an assert in debug builds or lead to file system corruption
in non-debug builds.  This seems to be reproducible with workloads of
small O_DSYNC writes, although so far I've not managed to come up with
an isolated reproducer.

The fix for the issue is relatively simple:  tell xfs_bmapi_write
that we are only asked to convert delayed allocations and skip holes
in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   44 ++++++++++++++++++++++++++++++++------------
 fs/xfs/libxfs/xfs_bmap.h |    6 +++++-
 fs/xfs/xfs_iomap.c       |    2 +-
 3 files changed, 38 insertions(+), 14 deletions(-)

--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4607,8 +4607,6 @@ xfs_bmapi_write(
 	int			n;		/* current extent index */
 	xfs_fileoff_t		obno;		/* old block number (offset) */
 	int			whichfork;	/* data or attr fork */
-	char			inhole;		/* current location is hole in file */
-	char			wasdelay;	/* old extent was delayed */
 
 #ifdef DEBUG
 	xfs_fileoff_t		orig_bno;	/* original block number value */
@@ -4694,22 +4692,44 @@ xfs_bmapi_write(
 	bma.firstblock = firstblock;
 
 	while (bno < end && n < *nmap) {
-		inhole = eof || bma.got.br_startoff > bno;
-		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+		bool			need_alloc = false, wasdelay = false;
 
-		/*
-		 * Make sure we only reflink into a hole.
-		 */
-		if (flags & XFS_BMAPI_REMAP)
-			ASSERT(inhole);
-		if (flags & XFS_BMAPI_COWFORK)
-			ASSERT(!inhole);
+		/* in hole or beyoned EOF? */
+		if (eof || bma.got.br_startoff > bno) {
+			if (flags & XFS_BMAPI_DELALLOC) {
+				/*
+				 * For the COW fork we can reasonably get a
+				 * request for converting an extent that races
+				 * with other threads already having converted
+				 * part of it, as there converting COW to
+				 * regular blocks is not protected using the
+				 * IOLOCK.
+				 */
+				ASSERT(flags & XFS_BMAPI_COWFORK);
+				if (!(flags & XFS_BMAPI_COWFORK)) {
+					error = -EIO;
+					goto error0;
+				}
+
+				if (eof || bno >= end)
+					break;
+			} else {
+				need_alloc = true;
+			}
+		} else {
+			/*
+			 * Make sure we only reflink into a hole.
+			 */
+			ASSERT(!(flags & XFS_BMAPI_REMAP));
+			if (isnullstartblock(bma.got.br_startblock))
+				wasdelay = true;
+		}
 
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if (inhole || wasdelay) {
+		if (need_alloc || wasdelay) {
 			bma.eof = eof;
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -110,6 +110,9 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK	0x200
 
+/* Only convert delalloc space, don't allocate entirely new extents */
+#define XFS_BMAPI_DELALLOC	0x400
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -120,7 +123,8 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
-	{ XFS_BMAPI_COWFORK,	"COWFORK" }
+	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
+	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }
 
 
 static inline int xfs_bmapi_aflag(int w)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -681,7 +681,7 @@ xfs_iomap_write_allocate(
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
-	int		flags = 0;
+	int		flags = XFS_BMAPI_DELALLOC;
 	int		nres;
 
 	if (whichfork == XFS_COW_FORK)



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 47/51] xfs: verify dirblocklog correctly
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (5 preceding siblings ...)
  2017-02-02 18:38 ` [PATCH 4.9 46/51] xfs: fix COW writeback race Greg Kroah-Hartman
@ 2017-02-02 18:38 ` Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 49/51] xfs: extsize hints are not unlikely in xfs_bmap_btalloc Greg Kroah-Hartman
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Eric Sandeen,
	Christoph Hellwig

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------


From: "Darrick J. Wong" <darrick.wong@oracle.com>

commit 83d230eb5c638949350f4761acdfc0af5cb1bc00 upstream.

sb_dirblklog is added to sb_blocklog to compute the directory block size
in bytes.  Therefore, we must compare the sum of both those values
against XFS_MAX_BLOCKSIZE_LOG, not just dirblklog.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_sb.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -242,7 +242,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
-	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 49/51] xfs: extsize hints are not unlikely in xfs_bmap_btalloc
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (6 preceding siblings ...)
  2017-02-02 18:38 ` [PATCH 4.9 47/51] xfs: verify dirblocklog correctly Greg Kroah-Hartman
@ 2017-02-02 18:38 ` Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 50/51] xfs: clear _XBF_PAGES from buffers when readahead page Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 51/51] xfs: fix bmv_count confusion w/ shared extents Greg Kroah-Hartman
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Christoph Hellwig,
	Arnd Bergmann

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <hch@lst.de>

commit 493611ebd62673f39e2f52c2561182c558a21cb6 upstream.

With COW files they are the hotpath, just like for files with the
extent size hint attribute.  We really shouldn't micro-manage anything
but failure cases with unlikely.

Additionally Arnd Bergmann recently reported that one of these two
unlikely annotations causes link failures together with an upcoming
kernel instrumentation patch, so let's get rid of it ASAP.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/libxfs/xfs_bmap.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3720,7 +3720,7 @@ xfs_bmap_btalloc(
 		align = xfs_get_cowextsz_hint(ap->ip);
 	else if (xfs_alloc_is_userdata(ap->datatype))
 		align = xfs_get_extsz_hint(ap->ip);
-	if (unlikely(align)) {
+	if (align) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 						align, 0, ap->eof, 0, ap->conv,
 						&ap->offset, &ap->length);
@@ -3792,7 +3792,7 @@ xfs_bmap_btalloc(
 		args.minlen = ap->minlen;
 	}
 	/* apply extent size hints if obtained earlier */
-	if (unlikely(align)) {
+	if (align) {
 		args.prod = align;
 		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 50/51] xfs: clear _XBF_PAGES from buffers when readahead page
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (7 preceding siblings ...)
  2017-02-02 18:38 ` [PATCH 4.9 49/51] xfs: extsize hints are not unlikely in xfs_bmap_btalloc Greg Kroah-Hartman
@ 2017-02-02 18:38 ` Greg Kroah-Hartman
  2017-02-02 18:38 ` [PATCH 4.9 51/51] xfs: fix bmv_count confusion w/ shared extents Greg Kroah-Hartman
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Eric Sandeen

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: "Darrick J. Wong" <darrick.wong@oracle.com>

commit 2aa6ba7b5ad3189cc27f14540aa2f57f0ed8df4b upstream.

If we try to allocate memory pages to back an xfs_buf that we're trying
to read, it's possible that we'll be so short on memory that the page
allocation fails.  For a blocking read we'll just wait, but for
readahead we simply dump all the pages we've collected so far.

Unfortunately, after dumping the pages we neglect to clear the
_XBF_PAGES state, which means that the subsequent call to xfs_buf_free
thinks that b_pages still points to pages we own.  It then double-frees
the b_pages pages.

This results in screaming about negative page refcounts from the memory
manager, which xfs oughtn't be triggering.  To reproduce this case,
mount a filesystem where the size of the inodes far outweighs the
available memory (a ~500M inode filesystem on a VM with 300MB memory
did the trick here) and run bulkstat in parallel with other memory
eating processes to put a huge load on the system.  The "check summary"
phase of xfs_scrub also works for this purpose.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_buf.c |    1 +
 1 file changed, 1 insertion(+)

--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -423,6 +423,7 @@ retry:
 out_free_pages:
 	for (i = 0; i < bp->b_page_count; i++)
 		__free_page(bp->b_pages[i]);
+	bp->b_flags &= ~_XBF_PAGES;
 	return error;
 }

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4.9 51/51] xfs: fix bmv_count confusion w/ shared extents
       [not found] <20170202183345.067336143@linuxfoundation.org>
                   ` (8 preceding siblings ...)
  2017-02-02 18:38 ` [PATCH 4.9 50/51] xfs: clear _XBF_PAGES from buffers when readahead page Greg Kroah-Hartman
@ 2017-02-02 18:38 ` Greg Kroah-Hartman
  9 siblings, 0 replies; 10+ messages in thread
From: Greg Kroah-Hartman @ 2017-02-02 18:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Greg Kroah-Hartman, linux-xfs, Darrick J. Wong, Eric Sandeen

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------


From: "Darrick J. Wong" <darrick.wong@oracle.com>

commit c364b6d0b6cda1cd5d9ab689489adda3e82529aa upstream.

In a bmapx call, bmv_count is the total size of the array, including the
zeroth element that userspace uses to supply the search key.  The output
array starts at offset 1 so that we can set up the user for the next
invocation.  Since we now can split an extent into multiple bmap records
due to shared/unshared status, we have to be careful that we don't
overflow the output array.

In the original patch f86f403794b ("xfs: teach get_bmapx about shared
extents and the CoW fork") I used cur_ext (the output index) to check
for overflows, albeit with an off-by-one error.  Since nexleft no longer
describes the number of unfilled slots in the output, we can rip all
that out and use cur_ext for the overflow check directly.

Failure to do this causes heap corruption in bmapx callers such as
xfs_io and xfs_scrub.  xfs/328 can reproduce this problem.

Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_bmap_util.c |   28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -528,7 +528,6 @@ xfs_getbmap(
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
 	int			nex;		/* # of user extents can do */
-	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
 	struct getbmapx		*out;		/* output structure */
@@ -686,10 +685,8 @@ xfs_getbmap(
 		goto out_free_map;
 	}
 
-	nexleft = nex;
-
 	do {
-		nmap = (nexleft > subnex) ? subnex : nexleft;
+		nmap = (nex> subnex) ? subnex : nex;
 		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
 				       map, &nmap, bmapi_flags);
@@ -697,8 +694,8 @@ xfs_getbmap(
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
-		for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
-				cur_ext < bmv->bmv_count; i++) {
+		for (i = 0; i < nmap && bmv->bmv_length &&
+				cur_ext < bmv->bmv_count - 1; i++) {
 			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -760,16 +757,27 @@ xfs_getbmap(
 				continue;
 			}
 
+			/*
+			 * In order to report shared extents accurately,
+			 * we report each distinct shared/unshared part
+			 * of a single bmbt record using multiple bmap
+			 * extents.  To make that happen, we iterate the
+			 * same map array item multiple times, each
+			 * time trimming out the subextent that we just
+			 * reported.
+			 *
+			 * Because of this, we must check the out array
+			 * index (cur_ext) directly against bmv_count-1
+			 * to avoid overflows.
+			 */
 			if (inject_map.br_startblock != NULLFSBLOCK) {
 				map[i] = inject_map;
 				i--;
-			} else
-				nexleft--;
+			}
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
-	} while (nmap && nexleft && bmv->bmv_length &&
-		 cur_ext < bmv->bmv_count);
+	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
 
  out_free_map:
 	kmem_free(map);



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-02-02 18:40 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20170202183345.067336143@linuxfoundation.org>
2017-02-02 18:37 ` [PATCH 4.9 33/51] xfs: bump up reserved blocks in xfs_alloc_set_aside Greg Kroah-Hartman
2017-02-02 18:37 ` [PATCH 4.9 34/51] xfs: fix bogus minleft manipulations Greg Kroah-Hartman
2017-02-02 18:37 ` [PATCH 4.9 35/51] xfs: adjust allocation length in xfs_alloc_space_available Greg Kroah-Hartman
2017-02-02 18:37 ` [PATCH 4.9 36/51] xfs: dont rely on ->total " Greg Kroah-Hartman
2017-02-02 18:37 ` [PATCH 4.9 37/51] xfs: dont print warnings when xfs_log_force fails Greg Kroah-Hartman
2017-02-02 18:38 ` [PATCH 4.9 46/51] xfs: fix COW writeback race Greg Kroah-Hartman
2017-02-02 18:38 ` [PATCH 4.9 47/51] xfs: verify dirblocklog correctly Greg Kroah-Hartman
2017-02-02 18:38 ` [PATCH 4.9 49/51] xfs: extsize hints are not unlikely in xfs_bmap_btalloc Greg Kroah-Hartman
2017-02-02 18:38 ` [PATCH 4.9 50/51] xfs: clear _XBF_PAGES from buffers when readahead page Greg Kroah-Hartman
2017-02-02 18:38 ` [PATCH 4.9 51/51] xfs: fix bmv_count confusion w/ shared extents Greg Kroah-Hartman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).