public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: compute the maximum repair reaping defer intent chain length
@ 2025-04-03 19:12 Darrick J. Wong
  2025-04-04  9:16 ` John Garry
  2025-04-08 22:14 ` Dave Chinner
  0 siblings, 2 replies; 7+ messages in thread
From: Darrick J. Wong @ 2025-04-03 19:12 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: xfs, John Garry

From: Darrick J. Wong <djwong@kernel.org>

Actually compute the log overhead of log intent items used in reap
operations and use that to compute the thresholds in reap.c instead of
assuming 2048 works.  Note that there have been no complaints because
tr_itruncate has a very large logres.

Cc: <stable@vger.kernel.org> # v6.6
Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/scrub/trace.h       |   29 ++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_item.h     |    3 +++
 fs/xfs/xfs_extfree_item.h  |    3 +++
 fs/xfs/xfs_log_priv.h      |   13 +++++++++++
 fs/xfs/xfs_refcount_item.h |    3 +++
 fs/xfs/xfs_rmap_item.h     |    3 +++
 fs/xfs/scrub/reap.c        |   50 +++++++++++++++++++++++++++++++++++++++-----
 fs/xfs/scrub/trace.c       |    1 +
 fs/xfs/xfs_bmap_item.c     |   10 +++++++++
 fs/xfs/xfs_extfree_item.c  |   10 +++++++++
 fs/xfs/xfs_log_cil.c       |    4 +---
 fs/xfs/xfs_refcount_item.c |   10 +++++++++
 fs/xfs/xfs_rmap_item.c     |   10 +++++++++
 13 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index d7c4ced47c1567..172765967aaab4 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2000,6 +2000,35 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
 DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
 DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
 
+DECLARE_EVENT_CLASS(xrep_reap_max_deferred_reaps_class,
+	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size,
+		 unsigned int max_deferred_reaps),
+	TP_ARGS(tp, per_intent_size, max_deferred_reaps),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, log_res)
+		__field(unsigned int, per_intent_size)
+		__field(unsigned int, max_deferred_reaps)
+	),
+	TP_fast_assign(
+		__entry->dev = tp->t_mountp->m_super->s_dev;
+		__entry->log_res = tp->t_log_res;
+		__entry->per_intent_size = per_intent_size;
+		__entry->max_deferred_reaps = max_deferred_reaps;
+	),
+	TP_printk("dev %d:%d logres %u per_intent_size %u max_deferred_reaps %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->log_res,
+		  __entry->per_intent_size,
+		  __entry->max_deferred_reaps)
+);
+#define DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(name) \
+DEFINE_EVENT(xrep_reap_max_deferred_reaps_class, name, \
+	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size, \
+		 unsigned int max_deferred_reaps), \
+	TP_ARGS(tp, per_intent_size, max_deferred_reaps))
+DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(xreap_agextent_max_deferred_reaps);
+
 DECLARE_EVENT_CLASS(xrep_reap_find_class,
 	TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
 		 xfs_extlen_t len, bool crosslinked),
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a5083436b..72512fc700e21a 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
 
 void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
 
+unsigned int xfs_bui_item_overhead(unsigned int nr);
+unsigned int xfs_bud_item_overhead(unsigned int nr);
+
 #endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c43060799b..ebb237a4ae87b4 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
 		struct xfs_extent_free_item *xefi,
 		struct xfs_defer_pending **dfpp);
 
+unsigned int xfs_efi_item_overhead(unsigned int nr);
+unsigned int xfs_efd_item_overhead(unsigned int nr);
+
 #endif	/* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5a3..39a102cc1b43e6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -698,4 +698,17 @@ xlog_kvmalloc(
 	return p;
 }
 
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+	unsigned int	niovecs,
+	unsigned int	nbytes)
+{
+	nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+	return round_up(nbytes, sizeof(uint64_t));
+}
+
 #endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63ce9..e23e768e031e20 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
 void xfs_refcount_defer_add(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri);
 
+unsigned int xfs_cui_item_overhead(unsigned int nr);
+unsigned int xfs_cud_item_overhead(unsigned int nr);
+
 #endif	/* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675ba..5fed8864bc32cc 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
 
 void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
 
+unsigned int xfs_rui_item_overhead(unsigned int nr);
+unsigned int xfs_rud_item_overhead(unsigned int nr);
+
 #endif	/* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b32fb233cf8476..2fd9b7465b5ed2 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -36,6 +36,9 @@
 #include "xfs_metafile.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -106,6 +109,9 @@ struct xreap_state {
 
 	/* Number of deferred reaps queued during the whole reap sequence. */
 	unsigned long long		total_deferred;
+
+	/* Maximum number of intents we can reap in a single transaction. */
+	unsigned int			max_deferred_reaps;
 };
 
 /* Put a block back on the AGFL. */
@@ -165,8 +171,8 @@ static inline bool xreap_dirty(const struct xreap_state *rs)
 
 /*
  * Decide if we want to roll the transaction after reaping an extent.  We don't
- * want to overrun the transaction reservation, so we prohibit more than
- * 128 EFIs per transaction.  For the same reason, we limit the number
+ * want to overrun the transaction reservation, so we restrict the number of
+ * log intent reaps per transaction.  For the same reason, we limit the number
  * of buffer invalidations to 2048.
  */
 static inline bool xreap_want_roll(const struct xreap_state *rs)
@@ -188,13 +194,11 @@ static inline void xreap_reset(struct xreap_state *rs)
 	rs->force_roll = false;
 }
 
-#define XREAP_MAX_DEFER_CHAIN		(2048)
-
 /*
  * Decide if we want to finish the deferred ops that are attached to the scrub
  * transaction.  We don't want to queue huge chains of deferred ops because
  * that can consume a lot of log space and kernel memory.  Hence we trigger a
- * xfs_defer_finish if there are more than 2048 deferred reap operations or the
+ * xfs_defer_finish if there are too many deferred reap operations or the
  * caller did some real work.
  */
 static inline bool
@@ -202,7 +206,7 @@ xreap_want_defer_finish(const struct xreap_state *rs)
 {
 	if (rs->force_roll)
 		return true;
-	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
+	if (rs->total_deferred > rs->max_deferred_reaps)
 		return true;
 	return false;
 }
@@ -495,6 +499,37 @@ xreap_agextent_iter(
 	return 0;
 }
 
+/*
+ * Compute the worst case log overhead of the intent items needed to reap a
+ * single per-AG space extent.
+ */
+STATIC unsigned int
+xreap_agextent_max_deferred_reaps(
+	struct xfs_scrub	*sc)
+{
+	const unsigned int	efi = xfs_efi_item_overhead(1);
+	const unsigned int	rui = xfs_rui_item_overhead(1);
+
+	/* unmapping crosslinked metadata blocks */
+	const unsigned int	t1 = rui;
+
+	/* freeing metadata blocks */
+	const unsigned int	t2 = rui + efi;
+
+	/* worst case of all four possible scenarios */
+	const unsigned int	per_intent = max(t1, t2);
+
+	/*
+	 * tr_itruncate has enough logres to unmap two file extents; use only
+	 * half the log reservation for intent items so there's space to do
+	 * actual work and requeue intent items.
+	 */
+	const unsigned int	ret = sc->tp->t_log_res / (2 * per_intent);
+
+	trace_xreap_agextent_max_deferred_reaps(sc->tp, per_intent, ret);
+	return max(1, ret);
+}
+
 /*
  * Break an AG metadata extent into sub-extents by fate (crosslinked, not
  * crosslinked), and dispose of each sub-extent separately.
@@ -556,6 +591,7 @@ xrep_reap_agblocks(
 		.sc			= sc,
 		.oinfo			= oinfo,
 		.resv			= type,
+		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
 	};
 	int				error;
 
@@ -668,6 +704,7 @@ xrep_reap_fsblocks(
 		.sc			= sc,
 		.oinfo			= oinfo,
 		.resv			= XFS_AG_RESV_NONE,
+		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
 	};
 	int				error;
 
@@ -922,6 +959,7 @@ xrep_reap_metadir_fsblocks(
 		.sc			= sc,
 		.oinfo			= &oinfo,
 		.resv			= XFS_AG_RESV_NONE,
+		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
 	};
 	int				error;
 
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2450e214103fed..987313a52e6401 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,6 +22,7 @@
 #include "xfs_parent.h"
 #include "xfs_metafile.h"
 #include "xfs_rtgroup.h"
+#include "xfs_trans.h"
 #include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad571a..586031332994ff 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
 	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
 }
 
+unsigned int xfs_bui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
 	*nbytes += sizeof(struct xfs_bud_log_format);
 }
 
+unsigned int xfs_bud_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given bud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888c7..1dd7f45359e090 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,6 +82,11 @@ xfs_efi_item_size(
 	*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
 }
 
+unsigned int xfs_efi_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given efi log item. We use only 1 iovec, and we point that
@@ -253,6 +258,11 @@ xfs_efd_item_size(
 	*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
 }
 
+unsigned int xfs_efd_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given efd log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40b3..f66d2d430e4f37 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
 		 * Then round nbytes up to 64-bit alignment so that the initial
 		 * buffer alignment is easy to calculate and verify.
 		 */
-		nbytes += niovecs *
-			(sizeof(uint64_t) + sizeof(struct xlog_op_header));
-		nbytes = round_up(nbytes, sizeof(uint64_t));
+		nbytes = xlog_item_space(niovecs, nbytes);
 
 		/*
 		 * The data buffer needs to start 64-bit aligned, so round up
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554fc..7ea43d35b1380d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
 	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
 }
 
+unsigned int xfs_cui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
 	*nbytes += sizeof(struct xfs_cud_log_format);
 }
 
+unsigned int xfs_cud_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given cud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8b5..3e214ce2339f54 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
 	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
 }
 
+unsigned int xfs_rui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
 	*nbytes += sizeof(struct xfs_rud_log_format);
 }
 
+unsigned int xfs_rud_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given rud log item. We use only 1 iovec, and we point that

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-03 19:12 [PATCH] xfs: compute the maximum repair reaping defer intent chain length Darrick J. Wong
@ 2025-04-04  9:16 ` John Garry
  2025-04-04 16:09   ` Darrick J. Wong
  2025-04-08 22:14 ` Dave Chinner
  1 sibling, 1 reply; 7+ messages in thread
From: John Garry @ 2025-04-04  9:16 UTC (permalink / raw)
  To: Darrick J. Wong, Carlos Maiolino; +Cc: xfs

On 03/04/2025 20:12, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Actually compute the log overhead of log intent items used in reap
> operations and use that to compute the thresholds in reap.c instead of
> assuming 2048 works.  Note that there have been no complaints because
> tr_itruncate has a very large logres.
> 

Thanks for this, but I have comments at the bottom

> Cc: <stable@vger.kernel.org> # v6.6
> Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible")
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
>   fs/xfs/scrub/trace.h       |   29 ++++++++++++++++++++++++++
>   fs/xfs/xfs_bmap_item.h     |    3 +++
>   fs/xfs/xfs_extfree_item.h  |    3 +++
>   fs/xfs/xfs_log_priv.h      |   13 +++++++++++
>   fs/xfs/xfs_refcount_item.h |    3 +++
>   fs/xfs/xfs_rmap_item.h     |    3 +++
>   fs/xfs/scrub/reap.c        |   50 +++++++++++++++++++++++++++++++++++++++-----
>   fs/xfs/scrub/trace.c       |    1 +
>   fs/xfs/xfs_bmap_item.c     |   10 +++++++++
>   fs/xfs/xfs_extfree_item.c  |   10 +++++++++
>   fs/xfs/xfs_log_cil.c       |    4 +---
>   fs/xfs/xfs_refcount_item.c |   10 +++++++++
>   fs/xfs/xfs_rmap_item.c     |   10 +++++++++
>   13 files changed, 140 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> index d7c4ced47c1567..172765967aaab4 100644
> --- a/fs/xfs/scrub/trace.h
> +++ b/fs/xfs/scrub/trace.h
> @@ -2000,6 +2000,35 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
>   DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
>   DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
>   
> +DECLARE_EVENT_CLASS(xrep_reap_max_deferred_reaps_class,
> +	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size,
> +		 unsigned int max_deferred_reaps),
> +	TP_ARGS(tp, per_intent_size, max_deferred_reaps),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(unsigned int, log_res)
> +		__field(unsigned int, per_intent_size)
> +		__field(unsigned int, max_deferred_reaps)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = tp->t_mountp->m_super->s_dev;
> +		__entry->log_res = tp->t_log_res;
> +		__entry->per_intent_size = per_intent_size;
> +		__entry->max_deferred_reaps = max_deferred_reaps;
> +	),
> +	TP_printk("dev %d:%d logres %u per_intent_size %u max_deferred_reaps %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->log_res,
> +		  __entry->per_intent_size,
> +		  __entry->max_deferred_reaps)
> +);
> +#define DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(name) \
> +DEFINE_EVENT(xrep_reap_max_deferred_reaps_class, name, \
> +	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size, \
> +		 unsigned int max_deferred_reaps), \
> +	TP_ARGS(tp, per_intent_size, max_deferred_reaps))
> +DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(xreap_agextent_max_deferred_reaps);
> +
>   DECLARE_EVENT_CLASS(xrep_reap_find_class,
>   	TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
>   		 xfs_extlen_t len, bool crosslinked),
> diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
> index 6fee6a5083436b..72512fc700e21a 100644
> --- a/fs/xfs/xfs_bmap_item.h
> +++ b/fs/xfs/xfs_bmap_item.h
> @@ -72,4 +72,7 @@ struct xfs_bmap_intent;
>   
>   void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
>   
> +unsigned int xfs_bui_item_overhead(unsigned int nr);
> +unsigned int xfs_bud_item_overhead(unsigned int nr);
> +
>   #endif	/* __XFS_BMAP_ITEM_H__ */
> diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
> index 41b7c43060799b..ebb237a4ae87b4 100644
> --- a/fs/xfs/xfs_extfree_item.h
> +++ b/fs/xfs/xfs_extfree_item.h
> @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
>   		struct xfs_extent_free_item *xefi,
>   		struct xfs_defer_pending **dfpp);
>   
> +unsigned int xfs_efi_item_overhead(unsigned int nr);
> +unsigned int xfs_efd_item_overhead(unsigned int nr);
> +
>   #endif	/* __XFS_EXTFREE_ITEM_H__ */
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index f3d78869e5e5a3..39a102cc1b43e6 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -698,4 +698,17 @@ xlog_kvmalloc(
>   	return p;
>   }
>   
> +/*
> + * Given a count of iovecs and space for a log item, compute the space we need
> + * in the log to store that data plus the log headers.
> + */
> +static inline unsigned int
> +xlog_item_space(
> +	unsigned int	niovecs,
> +	unsigned int	nbytes)
> +{
> +	nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
> +	return round_up(nbytes, sizeof(uint64_t));
> +}
> +
>   #endif	/* __XFS_LOG_PRIV_H__ */
> diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
> index bfee8f30c63ce9..e23e768e031e20 100644
> --- a/fs/xfs/xfs_refcount_item.h
> +++ b/fs/xfs/xfs_refcount_item.h
> @@ -76,4 +76,7 @@ struct xfs_refcount_intent;
>   void xfs_refcount_defer_add(struct xfs_trans *tp,
>   		struct xfs_refcount_intent *ri);
>   
> +unsigned int xfs_cui_item_overhead(unsigned int nr);
> +unsigned int xfs_cud_item_overhead(unsigned int nr);
> +
>   #endif	/* __XFS_REFCOUNT_ITEM_H__ */
> diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
> index 40d331555675ba..5fed8864bc32cc 100644
> --- a/fs/xfs/xfs_rmap_item.h
> +++ b/fs/xfs/xfs_rmap_item.h
> @@ -75,4 +75,7 @@ struct xfs_rmap_intent;
>   
>   void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
>   
> +unsigned int xfs_rui_item_overhead(unsigned int nr);
> +unsigned int xfs_rud_item_overhead(unsigned int nr);
> +
>   #endif	/* __XFS_RMAP_ITEM_H__ */
> diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
> index b32fb233cf8476..2fd9b7465b5ed2 100644
> --- a/fs/xfs/scrub/reap.c
> +++ b/fs/xfs/scrub/reap.c
> @@ -36,6 +36,9 @@
>   #include "xfs_metafile.h"
>   #include "xfs_rtgroup.h"
>   #include "xfs_rtrmap_btree.h"
> +#include "xfs_extfree_item.h"
> +#include "xfs_rmap_item.h"
> +#include "xfs_refcount_item.h"
>   #include "scrub/scrub.h"
>   #include "scrub/common.h"
>   #include "scrub/trace.h"
> @@ -106,6 +109,9 @@ struct xreap_state {
>   
>   	/* Number of deferred reaps queued during the whole reap sequence. */
>   	unsigned long long		total_deferred;
> +
> +	/* Maximum number of intents we can reap in a single transaction. */
> +	unsigned int			max_deferred_reaps;
>   };
>   
>   /* Put a block back on the AGFL. */
> @@ -165,8 +171,8 @@ static inline bool xreap_dirty(const struct xreap_state *rs)
>   
>   /*
>    * Decide if we want to roll the transaction after reaping an extent.  We don't
> - * want to overrun the transaction reservation, so we prohibit more than
> - * 128 EFIs per transaction.  For the same reason, we limit the number
> + * want to overrun the transaction reservation, so we restrict the number of
> + * log intent reaps per transaction.  For the same reason, we limit the number
>    * of buffer invalidations to 2048.
>    */
>   static inline bool xreap_want_roll(const struct xreap_state *rs)
> @@ -188,13 +194,11 @@ static inline void xreap_reset(struct xreap_state *rs)
>   	rs->force_roll = false;
>   }
>   
> -#define XREAP_MAX_DEFER_CHAIN		(2048)
> -
>   /*
>    * Decide if we want to finish the deferred ops that are attached to the scrub
>    * transaction.  We don't want to queue huge chains of deferred ops because
>    * that can consume a lot of log space and kernel memory.  Hence we trigger a
> - * xfs_defer_finish if there are more than 2048 deferred reap operations or the
> + * xfs_defer_finish if there are too many deferred reap operations or the
>    * caller did some real work.
>    */
>   static inline bool
> @@ -202,7 +206,7 @@ xreap_want_defer_finish(const struct xreap_state *rs)
>   {
>   	if (rs->force_roll)
>   		return true;
> -	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
> +	if (rs->total_deferred > rs->max_deferred_reaps)
>   		return true;
>   	return false;
>   }
> @@ -495,6 +499,37 @@ xreap_agextent_iter(
>   	return 0;
>   }
>   
> +/*
> + * Compute the worst case log overhead of the intent items needed to reap a
> + * single per-AG space extent.
> + */
> +STATIC unsigned int
> +xreap_agextent_max_deferred_reaps(
> +	struct xfs_scrub	*sc)
> +{
> +	const unsigned int	efi = xfs_efi_item_overhead(1);
> +	const unsigned int	rui = xfs_rui_item_overhead(1);
> +
> +	/* unmapping crosslinked metadata blocks */
> +	const unsigned int	t1 = rui;
> +
> +	/* freeing metadata blocks */
> +	const unsigned int	t2 = rui + efi;
> +
> +	/* worst case of all four possible scenarios */
> +	const unsigned int	per_intent = max(t1, t2);
> +
> +	/*
> +	 * tr_itruncate has enough logres to unmap two file extents; use only
> +	 * half the log reservation for intent items so there's space to do
> +	 * actual work and requeue intent items.
> +	 */
> +	const unsigned int	ret = sc->tp->t_log_res / (2 * per_intent);
> +
> +	trace_xreap_agextent_max_deferred_reaps(sc->tp, per_intent, ret);
> +	return max(1, ret);
> +}
> +
>   /*
>    * Break an AG metadata extent into sub-extents by fate (crosslinked, not
>    * crosslinked), and dispose of each sub-extent separately.
> @@ -556,6 +591,7 @@ xrep_reap_agblocks(
>   		.sc			= sc,
>   		.oinfo			= oinfo,
>   		.resv			= type,
> +		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
>   	};
>   	int				error;
>   
> @@ -668,6 +704,7 @@ xrep_reap_fsblocks(
>   		.sc			= sc,
>   		.oinfo			= oinfo,
>   		.resv			= XFS_AG_RESV_NONE,
> +		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
>   	};
>   	int				error;
>   
> @@ -922,6 +959,7 @@ xrep_reap_metadir_fsblocks(
>   		.sc			= sc,
>   		.oinfo			= &oinfo,
>   		.resv			= XFS_AG_RESV_NONE,
> +		.max_deferred_reaps	= xreap_agextent_max_deferred_reaps(sc),
>   	};
>   	int				error;
>   
> diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
> index 2450e214103fed..987313a52e6401 100644
> --- a/fs/xfs/scrub/trace.c
> +++ b/fs/xfs/scrub/trace.c
> @@ -22,6 +22,7 @@
>   #include "xfs_parent.h"
>   #include "xfs_metafile.h"
>   #include "xfs_rtgroup.h"
> +#include "xfs_trans.h"
>   #include "scrub/scrub.h"
>   #include "scrub/xfile.h"
>   #include "scrub/xfarray.h"
> diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
> index 3d52e9d7ad571a..586031332994ff 100644
> --- a/fs/xfs/xfs_bmap_item.c
> +++ b/fs/xfs/xfs_bmap_item.c
> @@ -77,6 +77,11 @@ xfs_bui_item_size(
>   	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
>   }
>   
> +unsigned int xfs_bui_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given bui log item. We use only 1 iovec, and we point that
> @@ -168,6 +173,11 @@ xfs_bud_item_size(
>   	*nbytes += sizeof(struct xfs_bud_log_format);
>   }
>   
> +unsigned int xfs_bud_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given bud log item. We use only 1 iovec, and we point that
> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> index a25c713ff888c7..1dd7f45359e090 100644
> --- a/fs/xfs/xfs_extfree_item.c
> +++ b/fs/xfs/xfs_extfree_item.c
> @@ -82,6 +82,11 @@ xfs_efi_item_size(
>   	*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
>   }
>   
> +unsigned int xfs_efi_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given efi log item. We use only 1 iovec, and we point that
> @@ -253,6 +258,11 @@ xfs_efd_item_size(
>   	*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
>   }
>   
> +unsigned int xfs_efd_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given efd log item. We use only 1 iovec, and we point that
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 1ca406ec1b40b3..f66d2d430e4f37 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
>   		 * Then round nbytes up to 64-bit alignment so that the initial
>   		 * buffer alignment is easy to calculate and verify.
>   		 */
> -		nbytes += niovecs *
> -			(sizeof(uint64_t) + sizeof(struct xlog_op_header));
> -		nbytes = round_up(nbytes, sizeof(uint64_t));
> +		nbytes = xlog_item_space(niovecs, nbytes);
>   
>   		/*
>   		 * The data buffer needs to start 64-bit aligned, so round up
> diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
> index fe2d7aab8554fc..7ea43d35b1380d 100644
> --- a/fs/xfs/xfs_refcount_item.c
> +++ b/fs/xfs/xfs_refcount_item.c
> @@ -78,6 +78,11 @@ xfs_cui_item_size(
>   	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
>   }
>   
> +unsigned int xfs_cui_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given cui log item. We use only 1 iovec, and we point that
> @@ -179,6 +184,11 @@ xfs_cud_item_size(
>   	*nbytes += sizeof(struct xfs_cud_log_format);
>   }
>   
> +unsigned int xfs_cud_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given cud log item. We use only 1 iovec, and we point that
> diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
> index 89decffe76c8b5..3e214ce2339f54 100644
> --- a/fs/xfs/xfs_rmap_item.c
> +++ b/fs/xfs/xfs_rmap_item.c
> @@ -77,6 +77,11 @@ xfs_rui_item_size(
>   	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
>   }
>   
> +unsigned int xfs_rui_item_overhead(unsigned int nr)
> +{
> +	return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
> +}
> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given rui log item. We use only 1 iovec, and we point that
> @@ -180,6 +185,11 @@ xfs_rud_item_size(
>   	*nbytes += sizeof(struct xfs_rud_log_format);
>   }
>   
> +unsigned int xfs_rud_item_overhead(unsigned int nr)

I guess that it is intentional, but nr is not used

> +{
> +	return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
> +}

I just noticed that this function - in addition to 
xfs_cud_item_overhead() and xfs_cui_item_overhead() - are not referenced 
in this patch, but only in the rest of the internal series which this is 
taken from.

> +
>   /*
>    * This is called to fill in the vector of log iovecs for the
>    * given rud log item. We use only 1 iovec, and we point that


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-04  9:16 ` John Garry
@ 2025-04-04 16:09   ` Darrick J. Wong
  2025-04-04 16:36     ` John Garry
  0 siblings, 1 reply; 7+ messages in thread
From: Darrick J. Wong @ 2025-04-04 16:09 UTC (permalink / raw)
  To: John Garry; +Cc: Carlos Maiolino, xfs

On Fri, Apr 04, 2025 at 10:16:39AM +0100, John Garry wrote:
> On 03/04/2025 20:12, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Actually compute the log overhead of log intent items used in reap
> > operations and use that to compute the thresholds in reap.c instead of
> > assuming 2048 works.  Note that there have been no complaints because
> > tr_itruncate has a very large logres.
> > 
> 
> Thanks for this, but I have comments at the bottom

<snip>

> > diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
> > index 89decffe76c8b5..3e214ce2339f54 100644
> > --- a/fs/xfs/xfs_rmap_item.c
> > +++ b/fs/xfs/xfs_rmap_item.c
> > @@ -77,6 +77,11 @@ xfs_rui_item_size(
> >   	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
> >   }
> > +unsigned int xfs_rui_item_overhead(unsigned int nr)
> > +{
> > +	return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
> > +}
> > +
> >   /*
> >    * This is called to fill in the vector of log iovecs for the
> >    * given rui log item. We use only 1 iovec, and we point that
> > @@ -180,6 +185,11 @@ xfs_rud_item_size(
> >   	*nbytes += sizeof(struct xfs_rud_log_format);
> >   }
> > +unsigned int xfs_rud_item_overhead(unsigned int nr)
> 
> I guess that it is intentional, but nr is not used

Eh, yeah, I suppose these parameters aren't necessary.

> > +{
> > +	return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
> > +}
> 
> I just noticed that this function - in addition to xfs_cud_item_overhead()

Hmmm.  Scrub uses tr_itruncate for transactions.  For reaping, it allows
up to half the reservation for intent items, and the other half to make
progress on one of those intent items.  So if we start by attaching this
to the first reap transaction:

RUI 0
EFI 0
...
RUI X
EFI X

Then on the next ->finish_one call, we'll finish RUI 0's deferred work.
In the worst case all the items need relogging, so the second reap
transaction looks like this:

RUD 0
EFD 0 + EFI 0'
...
RUD X + RUI X'
EFD X + EFD X'
<pile of rmap btree buffers>

So I guess the computation /does/ need to account for RUDs, so the code
at the top of xreap_agextent_max_deferred_reaps should be:

	const unsigned int	efi = xfs_efi_item_overhead(1) +
				      xfs_efd_item_overhead(1);
	const unsigned int	rui = xfs_rui_item_overhead(1) +
				      xfs_rud_item_overhead(1);

Thanks for pointing that out.

Thinking about this further, reaping doesn't touch the bmap and it only
processes a single logical change to a single extent.  So we don't need
to save half of the tr_itruncate reservation for the actual btree
updates; that could instead be:

	/*
	 * agf, agfl, and superblock for the freed extent
	 * worst case split in allocation btrees for freeing 1 extent
	 */
	upd = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
	      xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);

	ret = (sc->tp->t_log_res - upd) / per_intent;

Note that xfs_allocfree_block_count allows for two full rmap btree
splits already, so upd covers the btree buffer updates for the RUI case.
I would conservatively double upd because rolling to accomodate more
intent items is better than overrunning the reservation.

> and xfs_cui_item_overhead() - are not referenced in this patch, but only in

The refcount intent items aren't needed for online fsck because xreap_*
doesn't mess with file data.  They're provided entirely for the sake of
cow fallback of multi-fsblock untorn writes.  IOWs, it's to reduce churn
between our patchsets (really, this patch and your patchset) assuming
that part of untorn writes actually goes into 6.16.

> the rest of the internal series which this is taken from.

I wish you wouldn't mention internal patchsets on public lists.

For everyone else who just saw this -- this used to be patch 2 of a
3-patch series that I sent John to support his work on the cow fallback
for multi-fsblock untorn writes.  The first patch was buggy so I threw
it away, and the third patch wasn't really needed but I didn't figure
that out until the second re-read of it.  This is the only remaining
patch.

--D

> > +
> >   /*
> >    * This is called to fill in the vector of log iovecs for the
> >    * given rud log item. We use only 1 iovec, and we point that
> 
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-04 16:09   ` Darrick J. Wong
@ 2025-04-04 16:36     ` John Garry
  2025-04-06 17:22       ` Darrick J. Wong
  0 siblings, 1 reply; 7+ messages in thread
From: John Garry @ 2025-04-04 16:36 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Carlos Maiolino, xfs

On 04/04/2025 17:09, Darrick J. Wong wrote:
>> and xfs_cui_item_overhead() - are not referenced in this patch, but only in
> The refcount intent items aren't needed for online fsck because xreap_*
> doesn't mess with file data.  They're provided entirely for the sake of
> cow fallback of multi-fsblock untorn writes.  IOWs, it's to reduce churn
> between our patchsets (really, this patch and your patchset) assuming
> that part of untorn writes actually goes into 6.16.

Can you please advise on how you would like to proceed with this patch 
and my dependent work?

Thanks,
John

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-04 16:36     ` John Garry
@ 2025-04-06 17:22       ` Darrick J. Wong
  0 siblings, 0 replies; 7+ messages in thread
From: Darrick J. Wong @ 2025-04-06 17:22 UTC (permalink / raw)
  To: John Garry; +Cc: Carlos Maiolino, xfs

On Fri, Apr 04, 2025 at 05:36:54PM +0100, John Garry wrote:
> On 04/04/2025 17:09, Darrick J. Wong wrote:
> > > and xfs_cui_item_overhead() - are not referenced in this patch, but only in
> > The refcount intent items aren't needed for online fsck because xreap_*
> > doesn't mess with file data.  They're provided entirely for the sake of
> > cow fallback of multi-fsblock untorn writes.  IOWs, it's to reduce churn
> > between our patchsets (really, this patch and your patchset) assuming
> > that part of untorn writes actually goes into 6.16.
> 
> Can you please advise on how you would like to proceed this patch and my
> dependent work?

Cut out whatever pieces you need from this patch below.

My guess is that you'll need to compute tr_logres as:

tr_logres = max(logres to do one step of a bui +
                logres to do one step of a cui +
                logres to do one step of a efi +
                logres to do one step of a rui) +
            max(logres to log one bui and bud +
                logres to log one cui and cud +
                logres to log one efi and efd +
                logres to log one rui and rud) * untorn_write_unit_max;

(or transform this formula to compute untorn_write_unit_max from
tr_logres)

--D

From: Darrick J. Wong <djwong@kernel.org>
Subject: [PATCH] xfs: add helpers to compute log item overhead

Add selected helpers to estimate the transaction reservation required to
write various log intent and buffer items to the log.  These helpers
will be used by the online repair code for more precise estimations of
how much work can be done in a single transaction.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_trans_resv.h |    4 ++++
 fs/xfs/xfs_bmap_item.h         |    3 +++
 fs/xfs/xfs_buf_item.h          |    3 +++
 fs/xfs/xfs_extfree_item.h      |    3 +++
 fs/xfs/xfs_log_priv.h          |   13 +++++++++++++
 fs/xfs/xfs_refcount_item.h     |    3 +++
 fs/xfs/xfs_rmap_item.h         |    3 +++
 fs/xfs/libxfs/xfs_trans_resv.c |    6 +++---
 fs/xfs/xfs_bmap_item.c         |   10 ++++++++++
 fs/xfs/xfs_buf_item.c          |   19 +++++++++++++++++++
 fs/xfs/xfs_extfree_item.c      |   10 ++++++++++
 fs/xfs/xfs_log_cil.c           |    4 +---
 fs/xfs/xfs_refcount_item.c     |   10 ++++++++++
 fs/xfs/xfs_rmap_item.c         |   10 ++++++++++
 14 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d269..e76052028cc9d4 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -97,6 +97,10 @@ struct xfs_trans_resv {
 
 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
 uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_refcountbt_block_count(struct xfs_mount *mp,
+		unsigned int num_ops);
+uint xfs_calc_buf_res(uint nbufs, uint size);
+uint xfs_calc_inode_res(struct xfs_mount *mp, uint ninodes);
 
 unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
 unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a5083436b..655b30bc17361e 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
 
 void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
 
+unsigned int xfs_bui_item_overhead(unsigned int nr);
+unsigned int xfs_bud_item_overhead(void);
+
 #endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cde85259a586d..a273f45b558da3 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
 void	xfs_buf_iodone(struct xfs_buf *);
 bool	xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
 
+unsigned int xfs_buf_inval_item_overhead(unsigned int map_count,
+		unsigned int blocksize);
+
 extern struct kmem_cache	*xfs_buf_item_cache;
 
 #endif	/* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c43060799b..ebb237a4ae87b4 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
 		struct xfs_extent_free_item *xefi,
 		struct xfs_defer_pending **dfpp);
 
+unsigned int xfs_efi_item_overhead(unsigned int nr);
+unsigned int xfs_efd_item_overhead(unsigned int nr);
+
 #endif	/* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5a3..39a102cc1b43e6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -698,4 +698,17 @@ xlog_kvmalloc(
 	return p;
 }
 
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+	unsigned int	niovecs,
+	unsigned int	nbytes)
+{
+	nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+	return round_up(nbytes, sizeof(uint64_t));
+}
+
 #endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63ce9..5976cf0a04a671 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
 void xfs_refcount_defer_add(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri);
 
+unsigned int xfs_cui_item_overhead(unsigned int nr);
+unsigned int xfs_cud_item_overhead(void);
+
 #endif	/* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675ba..0dac2cfe456749 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
 
 void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
 
+unsigned int xfs_rui_item_overhead(unsigned int nr);
+unsigned int xfs_rud_item_overhead(void);
+
 #endif	/* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 13d00c7166e178..ce1393bd3561fd 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -47,7 +47,7 @@ xfs_buf_log_overhead(void)
  * will be changed in a transaction.  size is used to tell how many
  * bytes should be reserved per item.
  */
-STATIC uint
+uint
 xfs_calc_buf_res(
 	uint		nbufs,
 	uint		size)
@@ -84,7 +84,7 @@ xfs_allocfree_block_count(
  * in the same transaction as an allocation or a free, so we compute them
  * separately.
  */
-static unsigned int
+unsigned int
 xfs_refcountbt_block_count(
 	struct xfs_mount	*mp,
 	unsigned int		num_ops)
@@ -129,7 +129,7 @@ xfs_rtrefcountbt_block_count(
  *	  additional to the records and pointers that fit inside the inode
  *	  forks.
  */
-STATIC uint
+uint
 xfs_calc_inode_res(
 	struct xfs_mount	*mp,
 	uint			ninodes)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad571a..c62b9c1dd448b8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
 	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
 }
 
+unsigned int xfs_bui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
 	*nbytes += sizeof(struct xfs_bud_log_format);
 }
 
+unsigned int xfs_bud_item_overhead(void)
+{
+	return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given bud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 47549cfa61cd82..503675053a228a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -167,6 +167,25 @@ xfs_buf_item_size_segment(
 	}
 }
 
+/*
+ * Compute the worst case log item overhead for an invalidated buffer with the
+ * given map count and block size.
+ */
+unsigned int
+xfs_buf_inval_item_overhead(
+	unsigned int	map_count,
+	unsigned int	blocksize)
+{
+	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
+	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
+	unsigned int	ret =
+		offsetof(struct xfs_buf_log_format, blf_data_map) +
+			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
+						    blf_data_map[0]));
+
+	return ret * map_count;
+}
+
 /*
  * Return the number of log iovecs and space needed to log the given buf log
  * item.
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888c7..1dd7f45359e090 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,6 +82,11 @@ xfs_efi_item_size(
 	*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
 }
 
+unsigned int xfs_efi_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given efi log item. We use only 1 iovec, and we point that
@@ -253,6 +258,11 @@ xfs_efd_item_size(
 	*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
 }
 
+unsigned int xfs_efd_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given efd log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40b3..f66d2d430e4f37 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
 		 * Then round nbytes up to 64-bit alignment so that the initial
 		 * buffer alignment is easy to calculate and verify.
 		 */
-		nbytes += niovecs *
-			(sizeof(uint64_t) + sizeof(struct xlog_op_header));
-		nbytes = round_up(nbytes, sizeof(uint64_t));
+		nbytes = xlog_item_space(niovecs, nbytes);
 
 		/*
 		 * The data buffer needs to start 64-bit aligned, so round up
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554fc..02defb7116419f 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
 	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
 }
 
+unsigned int xfs_cui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
 	*nbytes += sizeof(struct xfs_cud_log_format);
 }
 
+unsigned int xfs_cud_item_overhead(void)
+{
+	return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given cud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8b5..45230072564114 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
 	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
 }
 
+unsigned int xfs_rui_item_overhead(unsigned int nr)
+{
+	return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
 	*nbytes += sizeof(struct xfs_rud_log_format);
 }
 
+unsigned int xfs_rud_item_overhead(void)
+{
+	return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the
  * given rud log item. We use only 1 iovec, and we point that

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-03 19:12 [PATCH] xfs: compute the maximum repair reaping defer intent chain length Darrick J. Wong
  2025-04-04  9:16 ` John Garry
@ 2025-04-08 22:14 ` Dave Chinner
  2025-04-08 23:13   ` Darrick J. Wong
  1 sibling, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2025-04-08 22:14 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Carlos Maiolino, xfs, John Garry

On Thu, Apr 03, 2025 at 12:12:44PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Actually compute the log overhead of log intent items used in reap
> operations and use that to compute the thresholds in reap.c instead of
> assuming 2048 works.  Note that there have been no complaints because
> tr_itruncate has a very large logres.
> 
> Cc: <stable@vger.kernel.org> # v6.6
> Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible")
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
>  fs/xfs/scrub/trace.h       |   29 ++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap_item.h     |    3 +++
>  fs/xfs/xfs_extfree_item.h  |    3 +++
>  fs/xfs/xfs_log_priv.h      |   13 +++++++++++
>  fs/xfs/xfs_refcount_item.h |    3 +++
>  fs/xfs/xfs_rmap_item.h     |    3 +++
>  fs/xfs/scrub/reap.c        |   50 +++++++++++++++++++++++++++++++++++++++-----
>  fs/xfs/scrub/trace.c       |    1 +
>  fs/xfs/xfs_bmap_item.c     |   10 +++++++++
>  fs/xfs/xfs_extfree_item.c  |   10 +++++++++
>  fs/xfs/xfs_log_cil.c       |    4 +---
>  fs/xfs/xfs_refcount_item.c |   10 +++++++++
>  fs/xfs/xfs_rmap_item.c     |   10 +++++++++
>  13 files changed, 140 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> index d7c4ced47c1567..172765967aaab4 100644
> --- a/fs/xfs/scrub/trace.h
> +++ b/fs/xfs/scrub/trace.h
> @@ -2000,6 +2000,35 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
>  DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
>  DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
>  
> +DECLARE_EVENT_CLASS(xrep_reap_max_deferred_reaps_class,
> +	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size,
> +		 unsigned int max_deferred_reaps),
> +	TP_ARGS(tp, per_intent_size, max_deferred_reaps),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(unsigned int, log_res)
> +		__field(unsigned int, per_intent_size)
> +		__field(unsigned int, max_deferred_reaps)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = tp->t_mountp->m_super->s_dev;
> +		__entry->log_res = tp->t_log_res;
> +		__entry->per_intent_size = per_intent_size;
> +		__entry->max_deferred_reaps = max_deferred_reaps;
> +	),
> +	TP_printk("dev %d:%d logres %u per_intent_size %u max_deferred_reaps %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->log_res,
> +		  __entry->per_intent_size,
> +		  __entry->max_deferred_reaps)
> +);
> +#define DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(name) \
> +DEFINE_EVENT(xrep_reap_max_deferred_reaps_class, name, \
> +	TP_PROTO(const struct xfs_trans *tp, unsigned int per_intent_size, \
> +		 unsigned int max_deferred_reaps), \
> +	TP_ARGS(tp, per_intent_size, max_deferred_reaps))
> +DEFINE_REPAIR_REAP_MAX_DEFER_CHAIN_EVENT(xreap_agextent_max_deferred_reaps);
> +
>  DECLARE_EVENT_CLASS(xrep_reap_find_class,
>  	TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
>  		 xfs_extlen_t len, bool crosslinked),
> diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
> index 6fee6a5083436b..72512fc700e21a 100644
> --- a/fs/xfs/xfs_bmap_item.h
> +++ b/fs/xfs/xfs_bmap_item.h
> @@ -72,4 +72,7 @@ struct xfs_bmap_intent;
>  
>  void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
>  
> +unsigned int xfs_bui_item_overhead(unsigned int nr);
> +unsigned int xfs_bud_item_overhead(unsigned int nr);
> +
>  #endif	/* __XFS_BMAP_ITEM_H__ */
> diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
> index 41b7c43060799b..ebb237a4ae87b4 100644
> --- a/fs/xfs/xfs_extfree_item.h
> +++ b/fs/xfs/xfs_extfree_item.h
> @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
>  		struct xfs_extent_free_item *xefi,
>  		struct xfs_defer_pending **dfpp);
>  
> +unsigned int xfs_efi_item_overhead(unsigned int nr);
> +unsigned int xfs_efd_item_overhead(unsigned int nr);
> +
>  #endif	/* __XFS_EXTFREE_ITEM_H__ */
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index f3d78869e5e5a3..39a102cc1b43e6 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -698,4 +698,17 @@ xlog_kvmalloc(
>  	return p;
>  }
>  
> +/*
> + * Given a count of iovecs and space for a log item, compute the space we need
> + * in the log to store that data plus the log headers.
> + */
> +static inline unsigned int
> +xlog_item_space(
> +	unsigned int	niovecs,
> +	unsigned int	nbytes)
> +{
> +	nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
> +	return round_up(nbytes, sizeof(uint64_t));
> +}
> +
>  #endif	/* __XFS_LOG_PRIV_H__ */
> diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
> index bfee8f30c63ce9..e23e768e031e20 100644
> --- a/fs/xfs/xfs_refcount_item.h
> +++ b/fs/xfs/xfs_refcount_item.h
> @@ -76,4 +76,7 @@ struct xfs_refcount_intent;
>  void xfs_refcount_defer_add(struct xfs_trans *tp,
>  		struct xfs_refcount_intent *ri);
>  
> +unsigned int xfs_cui_item_overhead(unsigned int nr);
> +unsigned int xfs_cud_item_overhead(unsigned int nr);
> +
>  #endif	/* __XFS_REFCOUNT_ITEM_H__ */
> diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
> index 40d331555675ba..5fed8864bc32cc 100644
> --- a/fs/xfs/xfs_rmap_item.h
> +++ b/fs/xfs/xfs_rmap_item.h
> @@ -75,4 +75,7 @@ struct xfs_rmap_intent;
>  
>  void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
>  
> +unsigned int xfs_rui_item_overhead(unsigned int nr);
> +unsigned int xfs_rud_item_overhead(unsigned int nr);
> +
>  #endif	/* __XFS_RMAP_ITEM_H__ */
> diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
> index b32fb233cf8476..2fd9b7465b5ed2 100644
> --- a/fs/xfs/scrub/reap.c
> +++ b/fs/xfs/scrub/reap.c
> @@ -36,6 +36,9 @@
>  #include "xfs_metafile.h"
>  #include "xfs_rtgroup.h"
>  #include "xfs_rtrmap_btree.h"
> +#include "xfs_extfree_item.h"
> +#include "xfs_rmap_item.h"
> +#include "xfs_refcount_item.h"
>  #include "scrub/scrub.h"
>  #include "scrub/common.h"
>  #include "scrub/trace.h"
> @@ -106,6 +109,9 @@ struct xreap_state {
>  
>  	/* Number of deferred reaps queued during the whole reap sequence. */
>  	unsigned long long		total_deferred;
> +
> +	/* Maximum number of intents we can reap in a single transaction. */
> +	unsigned int			max_deferred_reaps;
>  };
>  
>  /* Put a block back on the AGFL. */
> @@ -165,8 +171,8 @@ static inline bool xreap_dirty(const struct xreap_state *rs)
>  
>  /*
>   * Decide if we want to roll the transaction after reaping an extent.  We don't
> - * want to overrun the transaction reservation, so we prohibit more than
> - * 128 EFIs per transaction.  For the same reason, we limit the number
> + * want to overrun the transaction reservation, so we restrict the number of
> + * log intent reaps per transaction.  For the same reason, we limit the number
>   * of buffer invalidations to 2048.
>   */
>  static inline bool xreap_want_roll(const struct xreap_state *rs)
> @@ -188,13 +194,11 @@ static inline void xreap_reset(struct xreap_state *rs)
>  	rs->force_roll = false;
>  }
>  
> -#define XREAP_MAX_DEFER_CHAIN		(2048)
> -
>  /*
>   * Decide if we want to finish the deferred ops that are attached to the scrub
>   * transaction.  We don't want to queue huge chains of deferred ops because
>   * that can consume a lot of log space and kernel memory.  Hence we trigger a
> - * xfs_defer_finish if there are more than 2048 deferred reap operations or the
> + * xfs_defer_finish if there are too many deferred reap operations or the
>   * caller did some real work.
>   */
>  static inline bool
> @@ -202,7 +206,7 @@ xreap_want_defer_finish(const struct xreap_state *rs)
>  {
>  	if (rs->force_roll)
>  		return true;
> -	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
> +	if (rs->total_deferred > rs->max_deferred_reaps)
>  		return true;
>  	return false;
>  }
> @@ -495,6 +499,37 @@ xreap_agextent_iter(
>  	return 0;
>  }
>  
> +/*
> + * Compute the worst case log overhead of the intent items needed to reap a
> + * single per-AG space extent.
> + */
> +STATIC unsigned int
> +xreap_agextent_max_deferred_reaps(
> +	struct xfs_scrub	*sc)
> +{
> +	const unsigned int	efi = xfs_efi_item_overhead(1);
> +	const unsigned int	rui = xfs_rui_item_overhead(1);

These wrappers don't return some "overhead" - they return the amount of
log space the intent will consume. Can we please call them
xfs_<intent_type>_log_space()?

> +
> +	/* unmapping crosslinked metadata blocks */
> +	const unsigned int	t1 = rui;
> +
> +	/* freeing metadata blocks */
> +	const unsigned int	t2 = rui + efi;
> +
> +	/* worst case of all four possible scenarios */
> +	const unsigned int	per_intent = max(t1, t2);

When is per_intent going to be different to t2? i.e. rui + efi > rui
will always be true, so we don't need to calculate it at runtime.

i.e. what "four scenarios" is this actually handling? I can't work
out what they are from the code or the description...

> +	/*
> +	 * tr_itruncate has enough logres to unmap two file extents; use only
> +	 * half the log reservation for intent items so there's space to do
> +	 * actual work and requeue intent items.
> +	 */
> +	const unsigned int	ret = sc->tp->t_log_res / (2 * per_intent);

So we are assuming only a single type of log reservation is used for
these reaping transactions?

If so, this is calculating a value that is constant for the life of
the mount and probably should be moved to all the other log
reservation calculation functions that are run at mount time and
stored with them.

i.e. this calculation is effectively:

	return (xfs_calc_itruncate_reservation(mp, false) >> 1) /
		(xfs_efi_log_space(1) + xfs_rui_log_space(1));

Also, I think that the algorithm used to calculate the number of
intents we can fit in such a transaction should be described in
the comment above the function, then implement the algorithm as
efficiently as possible (as we already do in xfs_trans_resv.c).

> +	trace_xreap_agextent_max_deferred_reaps(sc->tp, per_intent, ret);

If it is calculated at mount time, this can be emitted along with
all the other log reservations calculated at mount time.

> +	return max(1, ret);

Why? tp->t_logres should always be much greater than the intent
size. How will this ever evaluate as zero without there being some
other serious log/transaction reservation bug elsewhere in the code?

i.e. if we can get zero here, that needs ASSERTS and/or WARN_ON
output and a filesystem shutdown because there is something
seriously wrong occurring in the filesystem.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: compute the maximum repair reaping defer intent chain length
  2025-04-08 22:14 ` Dave Chinner
@ 2025-04-08 23:13   ` Darrick J. Wong
  0 siblings, 0 replies; 7+ messages in thread
From: Darrick J. Wong @ 2025-04-08 23:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Carlos Maiolino, xfs, John Garry

On Wed, Apr 09, 2025 at 08:14:31AM +1000, Dave Chinner wrote:
> On Thu, Apr 03, 2025 at 12:12:44PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Actually compute the log overhead of log intent items used in reap
> > operations and use that to compute the thresholds in reap.c instead of
> > assuming 2048 works.  Note that there have been no complaints because
> > tr_itruncate has a very large logres.
> > 
> > Cc: <stable@vger.kernel.org> # v6.6
> > Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible")
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>

/me notes this is an RFC patch and I just finished writing a more
expansive patchset that breaks up the pieces and computes limits for
each usecase separately.

> > ---
> >  fs/xfs/scrub/trace.h       |   29 ++++++++++++++++++++++++++
> >  fs/xfs/xfs_bmap_item.h     |    3 +++
> >  fs/xfs/xfs_extfree_item.h  |    3 +++
> >  fs/xfs/xfs_log_priv.h      |   13 +++++++++++
> >  fs/xfs/xfs_refcount_item.h |    3 +++
> >  fs/xfs/xfs_rmap_item.h     |    3 +++
> >  fs/xfs/scrub/reap.c        |   50 +++++++++++++++++++++++++++++++++++++++-----
> >  fs/xfs/scrub/trace.c       |    1 +
> >  fs/xfs/xfs_bmap_item.c     |   10 +++++++++
> >  fs/xfs/xfs_extfree_item.c  |   10 +++++++++
> >  fs/xfs/xfs_log_cil.c       |    4 +---
> >  fs/xfs/xfs_refcount_item.c |   10 +++++++++
> >  fs/xfs/xfs_rmap_item.c     |   10 +++++++++
> >  13 files changed, 140 insertions(+), 9 deletions(-)

<snip>

> > @@ -495,6 +499,37 @@ xreap_agextent_iter(
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Compute the worst case log overhead of the intent items needed to reap a
> > + * single per-AG space extent.
> > + */
> > +STATIC unsigned int
> > +xreap_agextent_max_deferred_reaps(
> > +	struct xfs_scrub	*sc)
> > +{
> > +	const unsigned int	efi = xfs_efi_item_overhead(1);
> > +	const unsigned int	rui = xfs_rui_item_overhead(1);
> 
> These wrappers don't return some "overhead" - they return the amount of
> log space the intent will consume. Can we please call them
> xfs_<intent_type>_log_space()?

Ok.

> > +
> > +	/* unmapping crosslinked metadata blocks */
> > +	const unsigned int	t1 = rui;
> > +
> > +	/* freeing metadata blocks */
> > +	const unsigned int	t2 = rui + efi;
> > +
> > +	/* worst case of all four possible scenarios */
> > +	const unsigned int	per_intent = max(t1, t2);
> 
> When is per_intent going to be different to t2? i.e. rui + efi > rui
> will always be true, so we don't need to calculate it at runtime.

Any decent compiler will combine this into:

	const unsigned int      per_intent = rui + efi;

> i.e. what "four scenarios" is this actually handling? I can't work
> out what they are from the code or the description...

The five things that xreap_agextent_iter() does.  One of them
(XFS_AG_RESV_AGFL) doesn't use intents so I only listed four.

> > +	/*
> > +	 * tr_itruncate has enough logres to unmap two file extents; use only
> > +	 * half the log reservation for intent items so there's space to do
> > +	 * actual work and requeue intent items.
> > +	 */
> > +	const unsigned int	ret = sc->tp->t_log_res / (2 * per_intent);
> 
> So we are assuming only a single type of log reservation is used for
> these reaping transactions?

Well it's *currently* tr_itruncate, but some day we could add a
bigger/separate repair transaction type.

> If so, this is calculating a value that is constant for the life of
> the mount and probably should be moved to all the other log
> reservation calculation functions that are run at mount time and
> stored with them.
> 
> i.e. this calculation is effectively:
> 
> 	return (xfs_calc_itruncate_reservation(mp, false) >> 1) /
> 		(xfs_efi_log_space(1) + xfs_rui_log_space(1));
> 
> Also, I think that the algorithm used to calculate the number of
> intents we can fit in such a transaction should be described in
> the comment above the function, then implement the algorithm as

I don't like  ^^^^^^^^^^^^^^^^^^ this style of putting the comment at
some distance from the actual computation code.  Look at
xfs_calc_link_reservation:

/*
 * For creating a link to an inode:
 *    the parent directory inode: inode size
 *    the linked inode: inode size
 *    the directory btree could split: (max depth + v2) * dir block size
 *    the directory bmap btree could join or split: (max depth + v2) * blocksize
 * And the bmap_finish transaction can free some bmap blocks giving:
 *    the agf for the ag in which the blocks live: sector size
 *    the agfl for the ag in which the blocks live: sector size
 *    the superblock for the free block count: sector size
 *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
 */
STATIC uint
xfs_calc_link_reservation(
	struct xfs_mount	*mp)
{
	overhead += xfs_calc_iunlink_remove_reservation(mp);
	t1 = xfs_calc_inode_res(mp, 2) +
	     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
	t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
	     xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
			      XFS_FSB_TO_B(mp, 1));

	if (xfs_has_parent(mp)) {
		t3 = resp->tr_attrsetm.tr_logres;
		overhead += xfs_calc_pptr_link_overhead();
	}

Does t1 correspond to "For creating a link"?  Does t2 correspond to "And
the bmap_finish..." ?  t3 isn't even in the comment.  To me it'd be much
clearer to write:

	/* Linking an unlinked inode into the directory tree */
	overhead += xfs_calc_iunlink_remove_reservation(mp);

	/*
	 * For creating a link to an inode:
	 *    the parent directory inode: inode size
	 *    the linked inode: inode size
	 *    the directory btree could split: (max depth + v2) * dir block size
	 *    the directory bmap btree could join or split: (max depth + v2) * blocksize
	 */
	t1 = xfs_calc_inode_res(mp, 2) +
	     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));

	/*
	 * And the bmap_finish transaction can free some bmap blocks giving:
	 *    the agf for the ag in which the blocks live: sector size
	 *    the agfl for the ag in which the blocks live: sector size
	 *    the superblock for the free block count: sector size
	 *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
	 */
	t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
	     xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),

	/*
	 * If parent pointers are enabled, save space to add a pptr
	 * xattr and the space to log the pptr items.
	 */
	if (xfs_has_parent(mp)) {
		t3 = resp->tr_attrsetm.tr_logres;
		overhead += xfs_calc_pptr_link_overhead();
	}

	return overhead + max3(t1, t2, t3);

calc_link isn't too bad, but truncate is more difficult to figure out.

> efficiently as possible (as we already do in xfs_trans_resv.c).

The downside of moving it to xfs_trans_resv.c is that now you've
separated the code that actually does the reaping from the code that
figures out the dynamic limits based on how reaping can happen.

> > +	trace_xreap_agextent_max_deferred_reaps(sc->tp, per_intent, ret);
> 
> If it is calculated at mount time, this can be emitted along with
> all the other log reservations calculated at mount time.
> 
> > +	return max(1, ret);
> 
> Why? tp->t_logres should always be much greater than the intent
> size. How will this ever evaluate as zero without there being some
> other serious log/transaction reservation bug elsewhere in the code?
> 
> i.e. if we can get zero here, that needs ASSERTS and/or WARN_ON
> output and a filesystem shutdown because there is something
> seriously wrong occurring in the filesystem.

Ok.  I'll just shut down the fs then.

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-04-08 23:13 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-03 19:12 [PATCH] xfs: compute the maximum repair reaping defer intent chain length Darrick J. Wong
2025-04-04  9:16 ` John Garry
2025-04-04 16:09   ` Darrick J. Wong
2025-04-04 16:36     ` John Garry
2025-04-06 17:22       ` Darrick J. Wong
2025-04-08 22:14 ` Dave Chinner
2025-04-08 23:13   ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox