* [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
@ 2023-04-24 22:51 Wengang Wang
2023-05-02 22:41 ` Wengang Wang
2023-05-12 18:24 ` Darrick J. Wong
From: Wengang Wang @ 2023-04-24 22:51 UTC (permalink / raw)
To: linux-xfs; +Cc: wen.gang.wang
To avoid a possible deadlock when allocating AGFL blocks, change the extent
freeing behaviour. The original behaviour for freeing the extents in an EFI
was to process them one by one in the same transaction. By the time the
second and subsequent extents are processed, the previous extents have
already produced busy extents. If the second and subsequent extents are in
the same AG as those busy extents, we risk a deadlock when allocating AGFL
blocks. A typical call trace for the deadlock looks like this:
#0 context_switch() kernel/sched/core.c:3881
#1 __schedule() kernel/sched/core.c:5111
#2 schedule() kernel/sched/core.c:5186
#3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
#4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
#5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
#6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
#7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
#8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
#9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
#10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
#11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
#12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
#13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
#14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764
#15 xfs_mountfs() fs/xfs/xfs_mount.c:978
#16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
#17 mount_bdev() fs/super.c:1417
#18 xfs_fs_mount() fs/xfs/xfs_super.c:1985
#19 legacy_get_tree() fs/fs_context.c:647
#20 vfs_get_tree() fs/super.c:1547
#21 do_new_mount() fs/namespace.c:2843
#22 do_mount() fs/namespace.c:3163
#23 ksys_mount() fs/namespace.c:3372
#24 __do_sys_mount() fs/namespace.c:3386
#25 __se_sys_mount() fs/namespace.c:3383
#26 __x64_sys_mount() fs/namespace.c:3383
#27 do_syscall_64() arch/x86/entry/common.c:296
#28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180
The deadlock can happen both at IO time and at log recovery time.
To avoid the above deadlock, this patch changes the extent freeing procedure
(a standalone model of the counter protocol follows the '---' line below):
1) It always lets the first extent in the EFI go (no change).
2) It increments the (new) per-AG counter whenever it lets an extent go.
3) For the 2nd and subsequent extents, if the owning AG already has pending
extents, it doesn't let the extent go in the current transaction. Instead,
it moves the extent in question and all subsequent extents to a new EFI and
retries the new EFI in a new transaction (by rolling the current one).
4) The EFD for the original EFI is filled with all the extents from the
original EFI.
5) Though the new EFI is placed after the original EFD, this is safe because
they are in the same in-memory transaction.
6) The new per-AG counter of pending extent freeings is decremented after
the log items in the in-memory transaction are committed to the CIL.
Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
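Notes: below is a minimal standalone model of the per-AG counter protocol
described above. The names and the explicit return-and-retry are
illustrative only; the actual patch keeps the counter in struct xfs_perag
and rolls the transaction rather than looping in place.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_AGS	4

/* models pag_nr_pending_extents, one counter per AG */
static atomic_int pending[NR_AGS];

/* The first extent of an EFI always goes; a 2nd+ extent goes only if the
 * AG has no extent freeing pending. */
static bool try_free_extent(int agno, bool first_in_efi)
{
	if (first_in_efi) {
		atomic_fetch_add(&pending[agno], 1);
		return true;
	}
	if (atomic_fetch_add(&pending[agno], 1) == 0)
		return true;		/* AG was idle, extent can go */
	atomic_fetch_sub(&pending[agno], 1);
	return false;			/* caller must roll and retry */
}

/* models xlog_cil_insert_items() dropping the counter per busy extent */
static void commit_to_cil(int agno, int nr_busy)
{
	atomic_fetch_sub(&pending[agno], nr_busy);
}

int main(void)
{
	/* one EFI freeing {3/X, 3/Y} */
	printf("3/X goes: %d\n", try_free_extent(3, true));	/* 1 */
	printf("3/Y goes: %d\n", try_free_extent(3, false));	/* 0, -EAGAIN */
	commit_to_cil(3, 1);	/* the roll commits busy 3/X to the CIL */
	/* after the roll, 3/Y is the first extent of the new EFI */
	printf("3/Y goes: %d\n", try_free_extent(3, true));	/* 1 */
	return 0;
}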
fs/xfs/libxfs/xfs_ag.c | 1 +
fs/xfs/libxfs/xfs_ag.h | 5 ++
fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_log_cil.c | 24 ++++++++-
4 files changed, 138 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 86696a1c6891..61ef61e05668 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -378,6 +378,7 @@ xfs_initialize_perag(
pag->pagb_tree = RB_ROOT;
#endif /* __KERNEL__ */
+ atomic_set(&pag->pag_nr_pending_extents, 0);
error = xfs_buf_hash_init(pag);
if (error)
goto out_remove_pag;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5e18536dfdce..5950bc36a32c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -82,6 +82,11 @@ struct xfs_perag {
uint16_t pag_sick;
spinlock_t pag_state_lock;
+ /*
+ * Number of concurrent extent freeings (not yet committed to the
+ * CIL) on this AG.
+ */
+ atomic_t pag_nr_pending_extents;
spinlock_t pagb_lock; /* lock for pagb_tree */
struct rb_root pagb_tree; /* ordered tree of busy extents */
unsigned int pagb_gen; /* generation count for pagb_tree */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 011b50469301..1dbf36d9c1c9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -336,6 +336,75 @@ xfs_trans_get_efd(
return efdp;
}
+/*
+ * Fill the EFD with all extents from the EFI and set the counter.
+ * Note: the EFD should already contain at least one extent.
+ */
+static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
+{
+ struct xfs_efi_log_item *efip = efdp->efd_efip;
+ uint i;
+
+ i = efdp->efd_next_extent;
+ ASSERT(i > 0);
+ for (; i < efip->efi_format.efi_nextents; i++) {
+ efdp->efd_format.efd_extents[i] =
+ efip->efi_format.efi_extents[i];
+ }
+ efdp->efd_next_extent = i;
+}
+
+/*
+ * Check if xefi is the first in the efip.
+ * Returns true if so, and false otherwise.
+ */
+static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *xefi)
+{
+ return efip->efi_format.efi_extents[0].ext_start ==
+ xefi->xefi_startblock;
+}
+
+/*
+ * Check if the xefi needs to be in a new transaction.
+ * efip is the containing EFI of xefi.
+ * Return true if so, false otherwise.
+ */
+static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *xefi)
+{
+ bool ret = true;
+ int nr_pre;
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+
+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+ pag = xfs_perag_get(mp, agno);
+ /* The first extent in EFI is always OK to go */
+ if (xfs_is_first_extent_in_efi(efip, xefi)) {
+ atomic_inc(&pag->pag_nr_pending_extents);
+ ret = false;
+ goto out_put;
+ }
+
+ /*
+ * Now the extent is the 2nd or a subsequent one in the efip. We
+ * need a new transaction if the AG already has busy extents pending.
+ */
+ nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
+ /* No previous pending extent freeing on this AG */
+ if (nr_pre == 0) {
+ ret = false;
+ goto out_put;
+ }
+
+ atomic_dec(&pag->pag_nr_pending_extents);
+out_put:
+ xfs_perag_put(pag);
+ return ret;
+}
+
/*
* Free an extent and log it to the EFD. Note that the transaction is marked
* dirty regardless of whether the extent free succeeds or fails to support the
@@ -356,6 +425,28 @@ xfs_trans_free_extent(
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
xefi->xefi_startblock);
int error;
+ struct xfs_efi_log_item *efip = efdp->efd_efip;
+
+ if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
+ /*
+ * This should be the 2nd+ extent; we don't have to mark the
+ * transaction and the EFD dirty, as that was already done with the
+ * first extent.
+ */
+ ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+ ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
+ ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
+
+ xfs_fill_efd_with_efi(efdp);
+
+ /*
+ * A previous extent in the same AG was processed in the current
+ * transaction. To avoid a possible AGFL allocation deadlock, we
+ * roll the transaction and then restart with this extent in a new
+ * transaction.
+ */
+ return -EAGAIN;
+ }
oinfo.oi_owner = xefi->xefi_owner;
if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
@@ -369,6 +460,13 @@ xfs_trans_free_extent(
error = __xfs_free_extent(tp, xefi->xefi_startblock,
xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ if (error) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ atomic_dec(&pag->pag_nr_pending_extents);
+ xfs_perag_put(pag);
+ }
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
- kmem_cache_free(xfs_extfree_item_cache, xefi);
+ if (error != -EAGAIN)
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -632,7 +731,15 @@ xfs_efi_item_recover(
fake.xefi_startblock = extp->ext_start;
fake.xefi_blockcount = extp->ext_len;
- error = xfs_trans_free_extent(tp, efdp, &fake);
+ if (error == 0)
+ error = xfs_trans_free_extent(tp, efdp, &fake);
+
+ if (error == -EAGAIN) {
+ ASSERT(i > 0);
+ xfs_free_extent_later(tp, fake.xefi_startblock,
+ fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER);
+ continue;
+ }
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index eccbfb99e894..97eda4487db0 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -16,6 +16,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
+#include "xfs_ag.h"
struct workqueue_struct *xfs_discard_wq;
@@ -643,8 +644,29 @@ xlog_cil_insert_items(
cilpcp->space_used += len;
}
/* attach the transaction to the CIL if it has any busy extents */
- if (!list_empty(&tp->t_busy))
+ if (!list_empty(&tp->t_busy)) {
+ struct xfs_perag *last_pag = NULL;
+ xfs_agnumber_t last_agno = -1;
+ struct xfs_extent_busy *ebp;
+
+ /*
+ * The pending extent freeings have been committed to the CIL;
+ * now it's time to let other extent freeings on the same AG go.
+ */
+ list_for_each_entry(ebp, &tp->t_busy, list) {
+ if (ebp->agno != last_agno) {
+ last_agno = ebp->agno;
+ if (last_pag)
+ xfs_perag_put(last_pag);
+ last_pag = xfs_perag_get(tp->t_mountp, last_agno);
+ }
+ atomic_dec(&last_pag->pag_nr_pending_extents);
+ }
+ if (last_pag)
+ xfs_perag_put(last_pag);
+
list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
+ }
/*
* Now update the order of everything modified in the transaction
--
2.21.0 (Apple Git-122.2)
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-04-24 22:51 [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions Wengang Wang
@ 2023-05-02 22:41 ` Wengang Wang
2023-05-12 18:24 ` Darrick J. Wong
From: Wengang Wang @ 2023-05-02 22:41 UTC (permalink / raw)
To: linux-xfs@vger.kernel.org
Hi Dave, did you get a chance to look at this?
thanks,
wengang
> On Apr 24, 2023, at 3:51 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
> [...]
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-04-24 22:51 [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions Wengang Wang
2023-05-02 22:41 ` Wengang Wang
@ 2023-05-12 18:24 ` Darrick J. Wong
2023-05-12 19:55 ` Wengang Wang
From: Darrick J. Wong @ 2023-05-12 18:24 UTC (permalink / raw)
To: Wengang Wang; +Cc: linux-xfs
Sorry for the slow response, I thought Dave would've responded by now.
On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
> To avoid a possible deadlock when allocating AGFL blocks, change the extent
> freeing behaviour. The original behaviour for freeing the extents in an EFI
> was to process them one by one in the same transaction. By the time the
> second and subsequent extents are processed, the previous extents have
> already produced busy extents. If the second and subsequent extents are in
> the same AG as those busy extents, we risk a deadlock when allocating AGFL
> blocks. A typical call trace for the deadlock looks like this:
It's been a few weeks, so let me try to remember what this is all about.
You have one EFI containing multiple extents to free:
{agno: 3, agbno: X, ... }
{agno: 3, agbno: Y, ... }
AG 3 is nearly full, so we free the first extent (3/X), which adds it to
the bnobt, records the newly freed extent in the extentbusylist, and
attaches the record to the transaction's busy extent list.
Then we move on to the second record in the EFI item. We want to free
(3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
of the bnobt/cntbt btrees.
So we try to fix the AG 3 AGFL to be long enough. We can find the space
in the bnobt, but the only space available is the (3/X) that we put
there during the last step. That space is still(!) busy and still
attached to the transaction, so the CIL cannot clear it. The AGFL fixer
sees that the space is busy, so it waits for it... and now we're dead in
the water.
The key to this deadlock is a thread waiting on its own uncommitted busy
extent, right?
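(For anyone following along, the wait in question looks roughly like this,
simplified from xfs_extent_busy_flush(); the real function also forces the
log first, which doesn't help here because the busy extent still belongs to
an uncommitted transaction and hasn't reached the CIL at all:

	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		/* sleeps forever: only clearing the busy extents bumps
		 * pagb_gen, and this thread holds one of them */
		schedule();
	} while (1);
	finish_wait(&pag->pagb_wait, &wait);

Nothing short of committing the transaction can bump pagb_gen and wake it.)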
>
> [...]
>
> The deadlock can happen both at IO time and at log recovery time.
>
> To avoid the above deadlock, this patch changes the extent freeing procedure:
> 1) It always lets the first extent in the EFI go (no change).
> 2) It increments the (new) per-AG counter whenever it lets an extent go.
> 3) For the 2nd and subsequent extents, if the owning AG already has pending
> extents, it doesn't let the extent go in the current transaction. Instead,
> it moves the extent in question and all subsequent extents to a new EFI and
> retries the new EFI in a new transaction (by rolling the current one).
> 4) The EFD for the original EFI is filled with all the extents from the
> original EFI.
> 5) Though the new EFI is placed after the original EFD, this is safe because
> they are in the same in-memory transaction.
> 6) The new per-AG counter of pending extent freeings is decremented after
> the log items in the in-memory transaction are committed to the CIL.
So the solution you're proposing is a per-AG counter of however many
threads have started an EFI free. If the second (or subsequent) EFI
free encounters an AG with a nonzero counter, it'll return EAGAIN to
cycle the transaction. The cycling is critical to getting the busy
extent to the CIL so the log can process it and make it unbusy. If the
AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
freeing, having bumped the per-AG counter. Right?
Therefore, the above deadlock sequence now becomes:
1. Free (3/X), placing (3/X) on the transaction's busy list.
2. Start trying to free (3/Y), notice that AG 3 has elevated
pag_nr_pending_extents, return EAGAIN
3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
transaction
4. Try to free (3/Y) again
At step 4, if pag_nr_pending_extents is still elevated, does that
mean we return EAGAIN and keep rolling the transaction until it's not
elevated? Shouldn't we be able to proceed at step 4, even if we end up
waiting on the log to clear (3/X) from the busy list?
What happens if the log actually clears (3/X) from the busy list after
step 3 but then some other thread starts processing an EFI for (3/Z)?
Does that cause this thread to roll repeatedly waiting for another
thread's EFI to clear?
IOWs, I'm not sure we can always make forward progress with this scheme,
and I'd prefer to avoid introducing more global state.
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> [...]
> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> index 011b50469301..1dbf36d9c1c9 100644
> --- a/fs/xfs/xfs_extfree_item.c
> +++ b/fs/xfs/xfs_extfree_item.c
> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
> return efdp;
> }
>
> +/*
> + * Fill the EFD with all extents from the EFI and set the counter.
> + * Note: the EFD should already contain at least one extent.
> + */
> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
> +{
> + struct xfs_efi_log_item *efip = efdp->efd_efip;
> + uint i;
> +
> + i = efdp->efd_next_extent;
> + ASSERT(i > 0);
> + for (; i < efip->efi_format.efi_nextents; i++) {
> + efdp->efd_format.efd_extents[i] =
> + efip->efi_format.efi_extents[i];
Why is it necessary to fill the EFD mapping structures? Nobody /ever/
looks at those; the only part that matters to log recovery is matching
efd.efd_efi_id to efi.efi_id.
But, I guess it looks funny to requeue an EFI and have the EFD for the
old EFI missing a bunch of fields.
> [...]
> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
This function comes with an unused **state variable:
STATIC int
xfs_extent_free_finish_item(
struct xfs_trans *tp,
struct xfs_log_item *done,
struct list_head *item,
struct xfs_btree_cur **state)
What if, after each xfs_trans_free_extent call, we stuff *state with
xefi_startblock's AG?
Then, upon entry to xfs_extent_free_finish_item, we compare *state with
the xefi item and return EAGAIN if we're processing an EFI from the same
AG as the previous call to xfs_extent_free_finish_item? Something like
this (totally untested):
STATIC int
xfs_extent_free_finish_item(
struct xfs_trans *tp,
struct xfs_log_item *done,
struct list_head *item,
struct xfs_btree_cur **state)
{
struct xfs_extent_free_item *xefi;
int error;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
if (*state) {
struct xfs_perag *oldpag = *(struct xfs_perag **)state;
/*
* If we're freeing two extents from the same AG, we
* must roll the transaction between the two extents to
* avoid a livelock resulting from AGFL fixing waiting
* on the extent that we just freed to become unbusy.
*/
if (oldpag == xefi->xefi_pag) {
xfs_fill_efd_with_efi(EFD_ITEM(done));
xfs_perag_put(oldpag);
return -EAGAIN;
}
xfs_perag_put(oldpag);
}
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
if (!error)
*state = (void *)xfs_perag_hold(xefi->xefi_pag);
xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
Obviously, you now need a ->finish_cleanup function to drop the perag
reference that may be stashed in @state. This (I think) avoids the
livelock by ensuring that xfs_trans_free_extent always starts with a
transaction that does not contain any busy extents from the same AG that
it's acting on, right?
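The cleanup hook itself could be as simple as this (also untested, and
assuming the same (tp, state, error) signature that the refcount and rmap
items use for ->finish_cleanup):

STATIC void
xfs_extent_free_finish_cleanup(
	struct xfs_trans	*tp,
	struct xfs_btree_cur	*state,
	int			error)
{
	struct xfs_perag	*pag = (struct xfs_perag *)state;

	/* drop the reference stashed in @state by ->finish_item */
	if (pag)
		xfs_perag_put(pag);
}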
A simpler version of the above would make it so that we always roll
between EFI records and don't have to manage perag references:
#define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
STATIC int
xfs_extent_free_finish_item(
struct xfs_trans *tp,
struct xfs_log_item *done,
struct list_head *item,
struct xfs_btree_cur **state)
{
struct xfs_extent_free_item *xefi;
int error;
/*
* If we're freeing two extents, roll the transaction between
* the two extents to avoid a livelock resulting from AGFL
* fixing waiting on the extent that we just freed to become
* unbusy. It's not necessary to roll if the previous and
* current EFI record point to different AGs, but this
* simplifies the state tracking.
*/
if (*state == XFS_EFI_STATE_ROLL) {
xfs_fill_efd_with_efi(EFD_ITEM(done));
*state = NULL;
return -EAGAIN;
}
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
if (!error)
*state = XFS_EFI_STATE_ROLL;
xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
(Did you and Dave already talk about this?)
--D
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-12 18:24 ` Darrick J. Wong
@ 2023-05-12 19:55 ` Wengang Wang
2023-05-12 21:13 ` Darrick J. Wong
From: Wengang Wang @ 2023-05-12 19:55 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs@vger.kernel.org
Hi Darrick,
> On May 12, 2023, at 11:24 AM, Darrick J. Wong <djwong@kernel.org> wrote:
>
> Sorry for the slow response, I thought Dave would've responded by now.
>
> On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
>> [...]
>
> It's been a few weeks, so let me try to remember what this is all about.
> You have one EFI containing multiple extents to free:
>
> {agno: 3, agbno: X, ... }
> {agno: 3, agbno: Y, ... }
>
> AG 3 is nearly full, so we free the first extent (3/X), which adds it to
> the bnobt, records the newly freed extent in the extentbusylist, and
> attaches the record to the transaction's busy extent list.
>
> Then we move on to the second record in the EFI item. We want to free
> (3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
> of the bnobt/cntbt btrees.
>
> So we try to fix the AG 3 AGFL to be long enough. We can find the space
> in the bnobt, but the only space available is the (3/X) that we put
> there during the last step. That space is still(!) busy and still
> attached to the transaction, so the CIL cannot clear it. The AGFL fixer
> sees that the space is busy, so it waits for it... and now we're dead in
> the water.
>
> The key to this deadlock is a thread waiting on its own uncommitted busy
> extent, right?
>
Yep, above is all correct.
>>
>> [...]
>>
>> The deadlock can happen both at IO time and at log recovery time.
>>
>> To avoid the above deadlock, this patch changes the extent freeing procedure:
>> 1) It always lets the first extent in the EFI go (no change).
>> 2) It increments the (new) per-AG counter whenever it lets an extent go.
>> 3) For the 2nd and subsequent extents, if the owning AG already has pending
>> extents, it doesn't let the extent go in the current transaction. Instead,
>> it moves the extent in question and all subsequent extents to a new EFI and
>> retries the new EFI in a new transaction (by rolling the current one).
>> 4) The EFD for the original EFI is filled with all the extents from the
>> original EFI.
>> 5) Though the new EFI is placed after the original EFD, this is safe because
>> they are in the same in-memory transaction.
>> 6) The new per-AG counter of pending extent freeings is decremented after
>> the log items in the in-memory transaction are committed to the CIL.
>
> So the solution you're proposing is a per-AG counter of however many
> threads have started an EFI free. If the second (or subsequent) EFI
> free encounters an AG with a nonzero counter, it'll return EAGAIN to
> cycle the transaction. The cycling is critical to getting the busy
> extent to the CIL so the log can process it and make it unbusy. If the
> AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
> freeing, having bumped the per-AG counter. Right?
Right.
>
> Therefore, the above deadlock sequence now becomes:
>
> 1. Free (3/X), placing (3/X) on the transaction's busy list.
>
> 2. Start trying to free (3/Y), notice that AG 3 has elevated
> pag_nr_pending_extents, return EAGAIN
>
> 3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
> transaction
>
> 4. Try to free (3/Y) again
>
> At step 4, if pag_nr_pending_extents is still elevated, does that
> mean we return EAGAIN and keep rolling the transaction until it's not
> elevated? Shouldn't we be able to proceed at step 4, even if we end up
> waiting on the log to clear (3/X) from the busy list?
The pag_nr_pending_extents counter is expected to be decremented in step 3.
The changes in xlog_cil_insert_items() are supposed to do that. So at step 4,
for this case {3/X, 3/Y}, pag_nr_pending_extents should have been decremented
to zero, so step 4 can go (after the busy extent 3/X is cleared).
>
> What happens if the log actually clears (3/X) from the busy list after
> step 3 but then some other thread starts processing an EFI for (3/Z)?
There is a rule in the patch that the first record in an EFI can always go
(bumping the per-AG counter). That's because freeing the first record never
hits the deadlock.
3/Y would go because it is now the first record of the new EFI, which
contains only 3/Y.
The other thread's 3/Z can go too, because it's the first record in that
EFI (bumping the per-AG counter, which could now be 3 at most).
> Does that cause this thread to roll repeatedly waiting for another
> thread's EFI to clear?
Nope. As I said, the first record can always go. A 2nd or subsequent
record, on retry, becomes the first record of the new EFI, and then it goes.
>
> IOWs, I'm not sure we can always make forward progress with this scheme,
> and I'd prefer to avoid introducing more global state.
See above explanation.
>
>> [...]
>> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
>> index 011b50469301..1dbf36d9c1c9 100644
>> --- a/fs/xfs/xfs_extfree_item.c
>> +++ b/fs/xfs/xfs_extfree_item.c
>> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
>> return efdp;
>> }
>>
>> +/*
>> + * Fill the EFD with all extents from the EFI and set the counter.
>> + * Note: the EFD should already contain at least one extent.
>> + */
>> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
>> +{
>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>> + uint i;
>> +
>> + i = efdp->efd_next_extent;
>> + ASSERT(i > 0);
>> + for (; i < efip->efi_format.efi_nextents; i++) {
>> + efdp->efd_format.efd_extents[i] =
>> + efip->efi_format.efi_extents[i];
>
> Why is it necessary to fill the EFD mapping structures? Nobody /ever/
> looks at those; the only part that matters to log recovery is matching
> efd.efd_efi_id to efi.efi_id.
Yes, that’s for EFI/EFD matching for log recovery.
The records to be retried would be in a new EFI.
>
> But, I guess it looks funny to requeue an EFI and have the EFD for the
> old EFI missing a bunch of fields.
The EFD for the original EFI is not missing anything; it is expected to be
exactly the same as if all the records had been processed.
This way we move records to new EFI(s) to avoid the deadlock.
>
>> + }
>> + efdp->efd_next_extent = i;
>> +}
>> +
>> +/*
>> + * Check if xefi is the first in the efip.
>> + * Returns true if so, ad false otherwise
>> + */
>> +static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
>> + struct xfs_extent_free_item *xefi)
>> +{
>> + return efip->efi_format.efi_extents[0].ext_start ==
>> + xefi->xefi_startblock;
>> +}
>> +
>> +/*
>> + * Check if the xefi needs to be in a new transaction.
>> + * efip is the containing EFI of xefi.
>> + * Return true if so, false otherwise.
>> + */
>> +static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
>> + struct xfs_efi_log_item *efip,
>> + struct xfs_extent_free_item *xefi)
>> +{
>> + bool ret = true;
>> + int nr_pre;
>> + xfs_agnumber_t agno;
>> + struct xfs_perag *pag;
>> +
>> + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
>> + pag = xfs_perag_get(mp, agno);
>> + /* The first extent in EFI is always OK to go */
>> + if (xfs_is_first_extent_in_efi(efip, xefi)) {
>> + atomic_inc(&pag->pag_nr_pending_extents);
>> + ret = false;
>> + goto out_put;
>> + }
>> +
>> + /*
>> + * Now the extent is the 2nd or subsequent in the efip. We need
>> + * new transaction if the AG already has busy extents pending.
>> + */
>> + nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
>> + /* No prevoius pending extent freeing to this AG */
>> + if (nr_pre == 0) {
>> + ret = false;
>> + goto out_put;
>> + }
>> +
>> + atomic_dec(&pag->pag_nr_pending_extents);
>> +out_put:
>> + xfs_perag_put(pag);
>> + return ret;
>> +}
>> +
>> /*
>> * Free an extent and log it to the EFD. Note that the transaction is marked
>> * dirty regardless of whether the extent free succeeds or fails to support the
>> @@ -356,6 +425,28 @@ xfs_trans_free_extent(
>> xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
>> xefi->xefi_startblock);
>> int error;
>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>> +
>> + if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
>> + /*
>> + * This should be the 2nd+ extent, we don't have to mark the
>> + * transaction and efd dirty, those are already done with the
>> + * first extent.
>> + */
>> + ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
>> + ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
>> + ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
>> +
>> + xfs_fill_efd_with_efi(efdp);
>> +
>> + /*
>> + * A preious extent in same AG is processed with the current
>> + * transaction. To avoid possible AGFL allocation deadlock,
>> + * we roll the transaction and then restart with this extent
>> + * with new transaction.
>> + */
>> + return -EAGAIN;
>> + }
>>
>> oinfo.oi_owner = xefi->xefi_owner;
>> if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
>> @@ -369,6 +460,13 @@ xfs_trans_free_extent(
>> error = __xfs_free_extent(tp, xefi->xefi_startblock,
>> xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
>> xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
>> + if (error) {
>> + struct xfs_perag *pag;
>> +
>> + pag = xfs_perag_get(mp, agno);
>> + atomic_dec(&pag->pag_nr_pending_extents);
>> + xfs_perag_put(pag);
>> + }
>> /*
>> * Mark the transaction dirty, even on error. This ensures the
>> * transaction is aborted, which:
>> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
>
> This function comes with an unused **state variable:
>
I don’t see why we need to take different action according to
the ‘state’.
> STATIC int
> xfs_extent_free_finish_item(
> struct xfs_trans *tp,
> struct xfs_log_item *done,
> struct list_head *item,
> struct xfs_btree_cur **state)
>
> What if, after each xfs_trans_free_extent call, we stuff *state with
> xefi_startblock's AG?
>
See above.
> Then, upon entry to xfs_extent_free_finish_item, we compare *state with
> the xefi item and return EAGAIN if we're processing an EFI from the same
> AG as the previous call to xfs_extent_free_finish_item? Something like
> this (totally untested):
>
> STATIC int
> xfs_extent_free_finish_item(
> struct xfs_trans *tp,
> struct xfs_log_item *done,
> struct list_head *item,
> struct xfs_btree_cur **state)
> {
> struct xfs_extent_free_item *xefi;
> int error;
>
> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>
> if (*state) {
> struct xfs_perag *oldpag = *(struct xfs_perag **)state;
>
> /*
> * If we're freeing two extents from the same AG, we
> * must roll the transaction between the two extents to
> * avoid a livelock resulting from AGFL fixing waiting
> * on the extent that we just freed to become unbusy.
> */
> if (oldpag == xefi->xefi_pag) {
> xfs_fill_efd_with_efi(EFD_ITEM(done));
> xfs_perag_put(oldpag);
> return -EAGAIN;
> }
As I said, the first record is always let go; we don't care about the previous AG.
> xfs_perag_put(oldpag);
> }
>
> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> if (!error)
> *state = (void *)xfs_perag_hold(xefi->xefi_pag);
>
> xfs_extent_free_put_group(xefi);
> kmem_cache_free(xfs_extfree_item_cache, xefi);
> return error;
> }
>
> Obviously, you now need a ->finish_cleanup function to drop the perag
> reference that may be stashed in @state. This (I think) avoids the
> livelock by ensuring that xfs_trans_free_extent always starts with a
> transaction that does not contain any busy extents from the same AG that
> it's acting on, right?
Nope, we don’t need to pass perag in state. Still the rule:
The first record in an EFI (no matter it’s the original EFI or new EFIs) just
goes (because it won’t hit the deadlock).
For the 2nd or subsequent records, we need to see if there are any pending
busy_extent to the same AG as the record belongs to. “pending” means
those are still in xfs_trans->t_busy (not committed to xlog_cil_pcp->busy_extents
yet). We know that by checking the newly introduced per AG counter
pag_nr_pending_extents. If there are, we retry the record in a new EFI
in a new transaction after committing the original transaction. With in the new
EFI, the 2nd record in the original EFI become the 1st record, and it just can
go (because it won’t hit the deadlock).
We are using the per-AG counter here to detect pending busy extents, rather
than comparing against the previous AGs touched within the same EFI, to also
avoid a potential ABBA deadlock:
thread 1: {3/X, 4/Y}
thread 2: {4/Z, 3/T}
Here thread 1 could hold busy extent 3/X while waiting on AG 4, and thread 2
could hold busy extent 4/Z while waiting on AG 3, so neither makes progress.
>
> A simpler version of the above would make it so that we always roll
> between EFI records and don't have to manage perag references:
>
> #define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
> STATIC int
> xfs_extent_free_finish_item(
> struct xfs_trans *tp,
> struct xfs_log_item *done,
> struct list_head *item,
> struct xfs_btree_cur **state)
> {
> struct xfs_extent_free_item *xefi;
> int error;
>
> /*
> * If we're freeing two extents, roll the transaction between
> * the two extents to avoid a livelock resulting from AGFL
> * fixing waiting on the extent that we just freed to become
> * unbusy. It's not necessary to roll if the previous and
> * current EFI record point to different AGs, but this
> * simplifies the state tracking.
> */
> if (*state == XFS_EFI_STATE_ROLL) {
> xfs_fill_efd_with_efi(EFD_ITEM(done));
> *state = NULL;
> return -EAGAIN;
> }
>
> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>
> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> if (!error)
> *state = XFS_EFI_STATE_ROLL;
>
> xfs_extent_free_put_group(xefi);
> kmem_cache_free(xfs_extfree_item_cache, xefi);
> return error;
> }
>
This way, it looks to me like the original EFI is always split into several
EFIs, each containing only one record. That's the same result as changing
XFS_EFI_MAX_FAST_EXTENTS to 1, which Dave doesn't like.
My implementation splits them only when necessary, as Dave suggested.
> (Did you and Dave already talk about this?)
There was a lot of discussion in these two threads:
[PATCH 2/2] xfs: log recovery stage split EFIs with multiple extents
[PATCH 1/2] xfs: IO time one extent per EFI
thanks,
wengang
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-12 19:55 ` Wengang Wang
@ 2023-05-12 21:13 ` Darrick J. Wong
2023-05-12 22:20 ` Wengang Wang
From: Darrick J. Wong @ 2023-05-12 21:13 UTC (permalink / raw)
To: Wengang Wang; +Cc: linux-xfs@vger.kernel.org
On Fri, May 12, 2023 at 07:55:28PM +0000, Wengang Wang wrote:
> Hi Darrick,
>
> > On May 12, 2023, at 11:24 AM, Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > Sorry for the slow response, I thought Dave would've responded by now.
> >
> > On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
> >> To avoid possible deadlock when allocating AGFL blocks, change the behaviour.
> >> The orignal hehaviour for freeing extents in an EFI is that the extents in
> >> the EFI were processed one by one in the same transaction. When the second
> >> and subsequent extents are being processed, we have produced busy extents for
> >> previous extents. If the second and subsequent extents are from the same AG
> >> as the busy extents are, we have the risk of deadlock when allocating AGFL
> >> blocks. A typical calltrace for the deadlock is like this:
> >
> > It's been a few weeks, so let me try to remember what this is all about.
> > You have one EFI containing multiple extents to free:
> >
> > {agno: 3, agbno: X, ... }
> > {agno: 3, agbno: Y, ... }
> >
> > AG 3 is nearly full, so we free the first extent (3/X), which adds it to
> > the bnobt, records the newly freed extent in the extentbusylist, and
> > attaches the record to the transaction's busy extent list.
> >
> > Then we move on to the second record in the EFI item. We want to free
> > (3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
> > of the bnobt/cntbt btrees.
> >
> > So we try to fix the AG 3 AGFL to be long enough. We can find the space
> > in the bnobt, but the only space available is the (3/X) that we put
> > there during the last step. That space is still(!) busy and still
> > attached to the transaction, so the CIL cannot clear it. The AGFL fixer
> > sees that the space is busy, so it waits for it... and now we're dead in
> > the water.
> >
> > The key to this deadlock is a thread waiting on its own uncommitted busy
> > extent, right?
> >
>
> Yep, above is all correct.
>
> >>
> >> #0 context_switch() kernel/sched/core.c:3881
> >> #1 __schedule() kernel/sched/core.c:5111
> >> #2 schedule() kernel/sched/core.c:5186
> >> #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
> >> #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
> >> #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
> >> #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
> >> #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
> >> #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
> >> #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
> >> #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
> >> #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
> >> #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
> >> #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
> >> #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764
> >> #15 xfs_mountfs() fs/xfs/xfs_mount.c:978
> >> #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
> >> #17 mount_bdev() fs/super.c:1417
> >> #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985
> >> #19 legacy_get_tree() fs/fs_context.c:647
> >> #20 vfs_get_tree() fs/super.c:1547
> >> #21 do_new_mount() fs/namespace.c:2843
> >> #22 do_mount() fs/namespace.c:3163
> >> #23 ksys_mount() fs/namespace.c:3372
> >> #24 __do_sys_mount() fs/namespace.c:3386
> >> #25 __se_sys_mount() fs/namespace.c:3383
> >> #26 __x64_sys_mount() fs/namespace.c:3383
> >> #27 do_syscall_64() arch/x86/entry/common.c:296
> >> #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180
> >>
> >> The deadlock could happen at both IO time and log recover time.
> >>
> >> To avoid above deadlock, this patch changes the extent free procedure.
> >> 1) it always lets the first extent from the EFI go (no change).
> >> 2) increase the (new) AG counter when it lets an extent go.
> >> 3) for the 2nd+ extents, if the owning AGs already have pending extents,
> >> don't let the extent go with the current transaction. Instead, move the
> >> extent in question and subsequent extents to a new EFI and try the new
> >> EFI again with a new transaction (by rolling the current transaction).
> >> 4) for the EFD to the original EFI, fill it with all the extents from the
> >> original EFI.
> >> 5) though the new EFI is placed after the original EFD, it's safe as they
> >> are in the same in-memory transaction.
> >> 6) The new AG counter for pending extent freeings is decremented after the
> >> log items in the in-memory transaction are committed to the CIL.
> >
> > So the solution you're proposing is a per-AG counter of however many
> > threads have started an EFI free. If the second (or subsequent) EFI
> > free encounters an AG with a nonzero counter, it'll return EAGAIN to
> > cycle the transaction. The cycling is critical to getting the busy
> > extent to the CIL so the log can process it and make it unbusy. If the
> > AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
> > freeing, having bumped the per-AG counter. Right?
>
> Right.
>
> >
> > Therefore, the above deadlock sequence now becomes:
> >
> > 1. Free (3/X), placing (3/X) on the transaction's busy list.
> >
> > 2. Start trying to free (3/Y), notice that AG 3 has elevated
> > pag_nr_pending_extents, return EAGAIN
> >
> > 3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
> > transaction
> >
> > 4. Try to free (3/Y) again
> >
> > At step 4, if pag_nr_pending_extents is still elevated, does that
> > mean we return EAGAIN and keep rolling the transaction until it's not
> > elevated? Shouldn't we be able to proceed at step 4, even if we end up
> > waiting on the log to clear (3/X) from the busy list?
>
> The pag_nr_pending_extents is expected to be decremented in step 3.
OH. I missed (and you pointed out below) the subtlety that step 3 gives
us a fresh transaction /and/ a fresh EFI, so after the roll, (3/Y) is
the "first" EFI and never has to wait.
> The changes in xlog_cil_insert_items() are supposed to do so. So at
> step 4, for this case {3/x, 3/y}, pag_nr_pending_extents should be
> decremented to zero, thus step 4 can go (after the busy extent 3/x is
> cleared).
So even if the log is being slow and the CIL hasn't cleared the busy
extent (3/X) by the time we finish rolling the tx and relogging the EFI,
the relogged EFI will start with (3/Y) and move forward without waiting.
Got it.
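(The xfs_log_cil.c hunk that performs this decrement is not quoted anywhere in
this thread. A minimal sketch of what it could look like, assuming each pending
free left exactly one entry on tp->t_busy; xlog_cil_drop_pending_extents() is a
hypothetical helper name that xlog_cil_insert_items() would call, and the real
hunk may well count differently:)

/*
 * Hypothetical sketch, not the actual patch hunk: once the log items of
 * this transaction reach the CIL, drop the per-AG pending-free counts
 * for the busy extents the transaction produced.  Assumes one counter
 * bump per entry on tp->t_busy; field names (e.g. busyp->agno) may
 * differ between kernel versions.
 */
static void
xlog_cil_drop_pending_extents(
	struct xfs_trans	*tp)
{
	struct xfs_extent_busy	*busyp;

	list_for_each_entry(busyp, &tp->t_busy, list) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(tp->t_mountp, busyp->agno);
		atomic_dec(&pag->pag_nr_pending_extents);
		xfs_perag_put(pag);
	}
}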
> >
> > What happens if the log actually clears (3/X) from the busy list after
> > step 3 but then some other thread starts processing an EFI for (3/Z)?
>
> There is a rule in the patch that the first record in an EFI can always go (bumping
> the per AG counter). That’s because freeing the first record never has
> the deadlock issue.
>
> The 3/Y would go because now 3/Y is the first record of the new EFI
> containing only the record 3/Y.
Oh, ok. I missed that.
> For the other thread processing 3/Z, 3/Z can go too because it’s the first record in
> that EFI (bumping the per AG counter, now the counter could be 3 at most).
<nod>
>
> > Does that cause this thread to roll repeatedly waiting for another
> > thread's EFI to clear?
>
> Nope. As I said, the first record can always go. A 2nd or subsequent
> record, on retry, becomes the first record in the new EFI, and it goes.
<nod>
> >
> > IOWs, I'm not sure we can always make forward progress with this scheme,
> > and I'd prefer to avoid introducing more global state.
>
> See above explanation.
>
> >
> >> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> >> ---
> >> fs/xfs/libxfs/xfs_ag.c | 1 +
> >> fs/xfs/libxfs/xfs_ag.h | 5 ++
> >> fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
> >> fs/xfs/xfs_log_cil.c | 24 ++++++++-
> >> 4 files changed, 138 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> >> index 86696a1c6891..61ef61e05668 100644
> >> --- a/fs/xfs/libxfs/xfs_ag.c
> >> +++ b/fs/xfs/libxfs/xfs_ag.c
> >> @@ -378,6 +378,7 @@ xfs_initialize_perag(
> >> pag->pagb_tree = RB_ROOT;
> >> #endif /* __KERNEL__ */
> >>
> >> + atomic_set(&pag->pag_nr_pending_extents, 0);
> >> error = xfs_buf_hash_init(pag);
> >> if (error)
> >> goto out_remove_pag;
> >> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> >> index 5e18536dfdce..5950bc36a32c 100644
> >> --- a/fs/xfs/libxfs/xfs_ag.h
> >> +++ b/fs/xfs/libxfs/xfs_ag.h
> >> @@ -82,6 +82,11 @@ struct xfs_perag {
> >> uint16_t pag_sick;
> >> spinlock_t pag_state_lock;
> >>
> >> + /*
> >> + * Number of concurrent extent freeings (not committed to CIL yet)
> >> + * on this AG.
> >> + */
> >> + atomic_t pag_nr_pending_extents;
> >> spinlock_t pagb_lock; /* lock for pagb_tree */
> >> struct rb_root pagb_tree; /* ordered tree of busy extents */
> >> unsigned int pagb_gen; /* generation count for pagb_tree */
> >> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> >> index 011b50469301..1dbf36d9c1c9 100644
> >> --- a/fs/xfs/xfs_extfree_item.c
> >> +++ b/fs/xfs/xfs_extfree_item.c
> >> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
> >> return efdp;
> >> }
> >>
> >> +/*
> >> + * Fill the EFD with all extents from the EFI and set the counter.
> >> + * Note: the EFD should contain at least one extent already.
> >> + */
> >> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
> >> +{
> >> + struct xfs_efi_log_item *efip = efdp->efd_efip;
> >> + uint i;
> >> +
> >> + i = efdp->efd_next_extent;
> >> + ASSERT(i > 0);
> >> + for (; i < efip->efi_format.efi_nextents; i++) {
> >> + efdp->efd_format.efd_extents[i] =
> >> + efip->efi_format.efi_extents[i];
> >
> > Why is it necessary to fill the EFD mapping structures? Nobody /ever/
> > looks at those; the only part that matters to log recovery is matching
> > efd.efd_efi_id to efi.efi_id.
>
> Yes, that’s for EFI/EFD matching for log recovery.
> The records to be retried would be in a new EFI.
>
> >
> > But, I guess it looks funny to requeue an EFI and have the EFD for the
> > old EFI missing a bunch of fields.
>
> The EFD for the original EFI is not missing anything; it is expected
> to be exactly the same as if all the records were processed.
> This way we move records to new EFI(s) to avoid the deadlock.
>
> >
> >> + }
> >> + efdp->efd_next_extent = i;
> >> +}
> >> +
> >> +/*
> >> + * Check if xefi is the first in the efip.
> >> + * Returns true if so, and false otherwise.
> >> + */
> >> +static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
> >> + struct xfs_extent_free_item *xefi)
> >> +{
> >> + return efip->efi_format.efi_extents[0].ext_start ==
> >> + xefi->xefi_startblock;
> >> +}
> >> +
> >> +/*
> >> + * Check if the xefi needs to be in a new transaction.
> >> + * efip is the containing EFI of xefi.
> >> + * Return true if so, false otherwise.
> >> + */
> >> +static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
> >> + struct xfs_efi_log_item *efip,
> >> + struct xfs_extent_free_item *xefi)
> >> +{
> >> + bool ret = true;
> >> + int nr_pre;
> >> + xfs_agnumber_t agno;
> >> + struct xfs_perag *pag;
> >> +
> >> + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
> >> + pag = xfs_perag_get(mp, agno);
> >> + /* The first extent in EFI is always OK to go */
> >> + if (xfs_is_first_extent_in_efi(efip, xefi)) {
> >> + atomic_inc(&pag->pag_nr_pending_extents);
> >> + ret = false;
> >> + goto out_put;
> >> + }
> >> +
> >> + /*
> >> + * Now the extent is the 2nd or subsequent in the efip. We need
> >> + * new transaction if the AG already has busy extents pending.
> >> + */
> >> + nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
> >> + /* No previous pending extent freeing to this AG */
> >> + if (nr_pre == 0) {
> >> + ret = false;
> >> + goto out_put;
> >> + }
> >> +
> >> + atomic_dec(&pag->pag_nr_pending_extents);
> >> +out_put:
> >> + xfs_perag_put(pag);
> >> + return ret;
> >> +}
> >> +
> >> /*
> >> * Free an extent and log it to the EFD. Note that the transaction is marked
> >> * dirty regardless of whether the extent free succeeds or fails to support the
> >> @@ -356,6 +425,28 @@ xfs_trans_free_extent(
> >> xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
> >> xefi->xefi_startblock);
> >> int error;
> >> + struct xfs_efi_log_item *efip = efdp->efd_efip;
> >> +
> >> + if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
> >> + /*
> >> + * This should be the 2nd+ extent, we don't have to mark the
> >> + * transaction and efd dirty, those are already done with the
> >> + * first extent.
> >> + */
> >> + ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
> >> + ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
> >> + ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
> >> +
> >> + xfs_fill_efd_with_efi(efdp);
> >> +
> >> + /*
> >> + * A previous extent in the same AG is processed with the current
> >> + * transaction. To avoid possible AGFL allocation deadlock,
> >> + * we roll the transaction and then restart with this extent
> >> + * with new transaction.
> >> + */
> >> + return -EAGAIN;
> >> + }
> >>
> >> oinfo.oi_owner = xefi->xefi_owner;
> >> if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
> >> @@ -369,6 +460,13 @@ xfs_trans_free_extent(
> >> error = __xfs_free_extent(tp, xefi->xefi_startblock,
> >> xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
> >> xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
> >> + if (error) {
> >> + struct xfs_perag *pag;
> >> +
> >> + pag = xfs_perag_get(mp, agno);
> >> + atomic_dec(&pag->pag_nr_pending_extents);
> >> + xfs_perag_put(pag);
> >> + }
> >> /*
> >> * Mark the transaction dirty, even on error. This ensures the
> >> * transaction is aborted, which:
> >> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
> >
> > This function comes with an unused **state variable:
> >
>
> I don’t see why we need to take different action according to
> the ‘state’.
>
> > STATIC int
> > xfs_extent_free_finish_item(
> > struct xfs_trans *tp,
> > struct xfs_log_item *done,
> > struct list_head *item,
> > struct xfs_btree_cur **state)
> >
> > What if, after each xfs_trans_free_extent call, we stuff *state with
> > xefi_startblock's AG?
> >
>
> see above.
>
> > Then, upon entry to xfs_extent_free_finish_item, we compare *state with
> > the xefi item and return EAGAIN if we're processing an EFI from the same
> > AG as the previous call to xfs_extent_free_finish_item? Something like
> > this (totally untested):
> >
> > STATIC int
> > xfs_extent_free_finish_item(
> > struct xfs_trans *tp,
> > struct xfs_log_item *done,
> > struct list_head *item,
> > struct xfs_btree_cur **state)
> > {
> > struct xfs_extent_free_item *xefi;
> > int error;
> >
> > xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
> >
> > if (*state) {
> > struct xfs_perag *oldpag = *(struct xfs_perag **)state;
> >
> > /*
> > * If we're freeing two extents from the same AG, we
> > * must roll the transaction between the two extents to
> > * avoid a livelock resulting from AGFL fixing waiting
> > * on the extent that we just freed to become unbusy.
> > */
> > if (oldpag == xefi->xefi_pag) {
> > xfs_fill_efd_with_efi(EFD_ITEM(done));
> > xfs_perag_put(oldpag);
> > return -EAGAIN;
> > }
>
> As I said, the first record is always let go, so we don’t care about the previous AG.
>
> > xfs_perag_put(oldpag);
> > }
> >
> > error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> > if (!error)
> > *state = (void *)xfs_perag_hold(xefi->xefi_pag);
> >
> > xfs_extent_free_put_group(xefi);
> > kmem_cache_free(xfs_extfree_item_cache, xefi);
> > return error;
> > }
> >
> > Obviously, you now need a ->finish_cleanup function to drop the perag
> > reference that may be stashed in @state. This (I think) avoids the
> > livelock by ensuring that xfs_trans_free_extent always starts with a
> > transaction that does not contain any busy extents from the same AG that
> > it's acting on, right?
>
> Nope, we don’t need to pass perag in state. Still the rule:
>
> The first record in an EFI (no matter whether it’s the original EFI or a new
> EFI) just goes (because it won’t hit the deadlock).
>
> For the 2nd or subsequent records, we need to see if there are any pending
> busy extents in the same AG as the record belongs to. “Pending” means
> those are still in xfs_trans->t_busy (not committed to xlog_cil_pcp->busy_extents
> yet). We know that by checking the newly introduced per-AG counter
> pag_nr_pending_extents. If there are, we retry the record in a new EFI
> in a new transaction after committing the original transaction. Within the new
> EFI, the 2nd record of the original EFI becomes the 1st record, and it can just
> go (because it won’t hit the deadlock).
<nod> So in other words, the pag_nr_pending_extents counter buys us the
performance advantage that we don't need to add the extra roll between
(3/X) and (3/Y) if the log is running fast enough to clear the busy
extent before we even start evaluating the relogged EFI (with 3/Y in
it).
How often does it happen that we have to roll the transaction and relog
the EFI?
> We are using a per-AG counter here to know if there are pending busy extents,
> rather than comparing against the previous AGs involved in the same EFI, which
> also avoids the potential ABBA deadlock:
> thread 1: {3/X, 4/Y}
> thread 2: {4/Z, 3/T}
>
> >
> > A simpler version of the above would make it so that we always roll
> > between EFI records and don't have to manage perag references:
> >
> > #define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
> > STATIC int
> > xfs_extent_free_finish_item(
> > struct xfs_trans *tp,
> > struct xfs_log_item *done,
> > struct list_head *item,
> > struct xfs_btree_cur **state)
> > {
> > struct xfs_extent_free_item *xefi;
> > int error;
> >
> > /*
> > * If we're freeing two extents, roll the transaction between
> > * the two extents to avoid a livelock resulting from AGFL
> > * fixing waiting on the extent that we just freed to become
> > * unbusy. It's not necessary to roll if the previous and
> > * current EFI record point to different AGs, but this
> > * simplifies the state tracking.
> > */
> > if (*state == XFS_EFI_STATE_ROLL) {
> > xfs_fill_efd_with_efi(EFD_ITEM(done));
> > *state = NULL;
> > return -EAGAIN;
> > }
> >
> > xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
> >
> > error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> > if (!error)
> > *state = XFS_EFI_STATE_ROLL;
> >
> > xfs_extent_free_put_group(xefi);
> > kmem_cache_free(xfs_extfree_item_cache, xefi);
> > return error;
> > }
> >
>
> Done this way, it looks to me like the original EFI is always split into several
> EFIs, each including only one record. It’s the same result as changing
> XFS_EFI_MAX_FAST_EXTENTS to 1. Dave doesn’t like that.
Yes.
> My implementation splits them only when necessary, as Dave suggested.
>
> > (Did you and Dave already talk about this?)
>
> There many discussion in this two threads:
> [PATCH 2/2] xfs: log recovery stage split EFIs with multiple extents
> [PATCH 1/2] xfs: IO time one extent per EFI
Ok, I'll go reread all that.
--D
> thanks,
> wengang
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-12 21:13 ` Darrick J. Wong
@ 2023-05-12 22:20 ` Wengang Wang
2023-05-16 16:05 ` Wengang Wang
0 siblings, 1 reply; 16+ messages in thread
From: Wengang Wang @ 2023-05-12 22:20 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs@vger.kernel.org
> On May 12, 2023, at 2:13 PM, Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Fri, May 12, 2023 at 07:55:28PM +0000, Wengang Wang wrote:
>> Hi Darrick,
>>
>>> On May 12, 2023, at 11:24 AM, Darrick J. Wong <djwong@kernel.org> wrote:
>>>
>>> Sorry for the slow response, I thought Dave would've responded by now.
>>>
>>> On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
>>>> To avoid possible deadlock when allocating AGFL blocks, change the behaviour.
>>>> The original behaviour for freeing extents in an EFI is that the extents in
>>>> the EFI were processed one by one in the same transaction. When the second
>>>> and subsequent extents are being processed, we have produced busy extents for
>>>> previous extents. If the second and subsequent extents are from the same AG
>>>> as the busy extents are, we have the risk of deadlock when allocating AGFL
>>>> blocks. A typical calltrace for the deadlock is like this:
>>>
>>> It's been a few weeks, so let me try to remember what this is all about.
>>> You have one EFI containing multiple extents to free:
>>>
>>> {agno: 3, agbno: X, ... }
>>> {agno: 3, agbno: Y, ... }
>>>
>>> AG 3 is nearly full, so we free the first extent (3/X), which adds it to
>>> the bnobt, records the newly freed extent in the extentbusylist, and
>>> attaches the record to the transaction's busy extent list.
>>>
>>> Then we move on to the second record in the EFI item. We want to free
>>> (3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
>>> of the bnobt/cntbt btrees.
>>>
>>> So we try to fix the AG 3 AGFL to be long enough. We can find the space
>>> in the bnobt, but the only space available is the (3/X) that we put
>>> there during the last step. That space is still(!) busy and still
>>> attached to the transaction, so the CIL cannot clear it. The AGFL fixer
>>> sees that the space is busy, so it waits for it... and now we're dead in
>>> the water.
>>>
>>> The key to this deadlock is a thread waiting on its own uncommitted busy
>>> extent, right?
>>>
>>
>> Yep, above is all correct.
>>
>>>>
>>>> #0 context_switch() kernel/sched/core.c:3881
>>>> #1 __schedule() kernel/sched/core.c:5111
>>>> #2 schedule() kernel/sched/core.c:5186
>>>> #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
>>>> #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
>>>> #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
>>>> #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
>>>> #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
>>>> #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
>>>> #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
>>>> #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
>>>> #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
>>>> #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
>>>> #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
>>>> #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764
>>>> #15 xfs_mountfs() fs/xfs/xfs_mount.c:978
>>>> #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
>>>> #17 mount_bdev() fs/super.c:1417
>>>> #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985
>>>> #19 legacy_get_tree() fs/fs_context.c:647
>>>> #20 vfs_get_tree() fs/super.c:1547
>>>> #21 do_new_mount() fs/namespace.c:2843
>>>> #22 do_mount() fs/namespace.c:3163
>>>> #23 ksys_mount() fs/namespace.c:3372
>>>> #24 __do_sys_mount() fs/namespace.c:3386
>>>> #25 __se_sys_mount() fs/namespace.c:3383
>>>> #26 __x64_sys_mount() fs/namespace.c:3383
>>>> #27 do_syscall_64() arch/x86/entry/common.c:296
>>>> #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180
>>>>
>>>> The deadlock could happen at both IO time and log recover time.
>>>>
>>>> To avoid above deadlock, this patch changes the extent free procedure.
>>>> 1) it always lets the first extent from the EFI go (no change).
>>>> 2) increase the (new) AG counter when it lets an extent go.
>>>> 3) for the 2nd+ extents, if the owning AGs already have pending extents,
>>>> don't let the extent go with the current transaction. Instead, move the
>>>> extent in question and subsequent extents to a new EFI and try the new
>>>> EFI again with a new transaction (by rolling the current transaction).
>>>> 4) for the EFD to the original EFI, fill it with all the extents from the
>>>> original EFI.
>>>> 5) though the new EFI is placed after the original EFD, it's safe as they
>>>> are in the same in-memory transaction.
>>>> 6) The new AG counter for pending extent freeings is decremented after the
>>>> log items in the in-memory transaction are committed to the CIL.
>>>
>>> So the solution you're proposing is a per-AG counter of however many
>>> threads have started an EFI free. If the second (or subsequent) EFI
>>> free encounters an AG with a nonzero counter, it'll return EAGAIN to
>>> cycle the transaction. The cycling is critical to getting the busy
>>> extent to the CIL so the log can process it and make it unbusy. If the
>>> AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
>>> freeing, having bumped the per-AG counter. Right?
>>
>> Right.
>>
>>>
>>> Therefore, the above deadlock sequence now becomes:
>>>
>>> 1. Free (3/X), placing (3/X) on the transaction's busy list.
>>>
>>> 2. Start trying to free (3/Y), notice that AG 3 has elevated
>>> pag_nr_pending_extents, return EAGAIN
>>>
>>> 3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
>>> transaction
>>>
>>> 4. Try to free (3/Y) again
>>>
>>> At step 4, if pag_nr_pending_extents is still elevated, does that
>>> mean we return EAGAIN and keep rolling the transaction until it's not
>>> elevated? Shouldn't we be able to proceed at step 4, even if we end up
>>> waiting on the log to clear (3/X) from the busy list?
>>
>> The pag_nr_pending_extents is expected to be decremented in step 3.
>
> OH. I missed (and you pointed out below) the subtlety that step 3 gives
> us a fresh transaction /and/ a fresh EFI, so after the roll, (3/Y) is
> the "first" EFI and never has to wait.
Yes exactly.
>
>> The changes in xlog_cil_insert_items() are supposed to do so. So at
>> step 4, for this case {3/x, 3/y}, pag_nr_pending_extents should be
>> decremented to zero, thus step 4 can go (after the busy extent 3/x is
>> cleared).
>
> So even if the log is being slow and the CIL hasn't cleared the busy
> extent (3/X) by the time we finish rolling the tx and relogging the EFI,
> the relogged EFI will start with (3/Y) and move forward without waiting.
> Got it.
Correct. There is no waiting while we start to process the EFI records.
The point here is that we process the records either in the existing transaction
or in new transactions (by rolling the current one).
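(For reference, the deferred-ops machinery already interprets -EAGAIN from
->finish_item() as "relog the remaining work as a new intent and roll". The
following is a simplified, non-verbatim sketch of that loop in
xfs_defer_finish_one(); details vary by kernel version:)

	/* sketch of the work-item loop in xfs_defer_finish_one(), simplified */
	list_for_each_safe(li, n, &dfp->dfp_work) {
		list_del(li);
		dfp->dfp_count--;
		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
		if (error == -EAGAIN) {
			/*
			 * The item wants a fresh transaction: put it back
			 * on the work list and log a new intent (a new EFI
			 * here) covering the remaining items.  The caller
			 * then rolls the transaction and retries, so the
			 * requeued record becomes the "first" record of
			 * the new EFI.
			 */
			list_add(li, &dfp->dfp_work);
			dfp->dfp_count++;
			dfp->dfp_intent = ops->create_intent(tp,
					&dfp->dfp_work, dfp->dfp_count, false);
			break;
		}
		if (error)
			return error;
	}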
>
>>>
>>> What happens if the log actually clears (3/X) from the busy list after
>>> step 3 but then some other thread starts processing an EFI for (3/Z)?
>>
>> There is a rule in the patch that the first record in an EFI can always go (bumping
>> the per AG counter). That’s because freeing the first record never has
>> the deadlock issue.
>>
>> The 3/Y would go because now 3/Y is the first record of the new EFI
>> containing only the record 3/Y.
>
> Oh, ok. I missed that.
>
>> For the other thread processing 3/Z, 3/Z can go too because it’s the first record in
>> that EFI (bumping the per AG counter, now the counter could be 3 at most).
>
> <nod>
>
>>
>>> Does that cause this thread to roll repeatedly waiting for another
>>> thread's EFI to clear?
>>
>> Nope. As I said, the first record can always go. A 2nd or subsequent
>> record, on retry, becomes the first record in the new EFI, and it goes.
>
> <nod>
>
>>>
>>> IOWs, I'm not sure we can always make forward progress with this scheme,
>>> and I'd prefer to avoid introducing more global state.
>>
>> See above explanation.
>>
>>>
>>>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>>>> ---
>>>> fs/xfs/libxfs/xfs_ag.c | 1 +
>>>> fs/xfs/libxfs/xfs_ag.h | 5 ++
>>>> fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
>>>> fs/xfs/xfs_log_cil.c | 24 ++++++++-
>>>> 4 files changed, 138 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
>>>> index 86696a1c6891..61ef61e05668 100644
>>>> --- a/fs/xfs/libxfs/xfs_ag.c
>>>> +++ b/fs/xfs/libxfs/xfs_ag.c
>>>> @@ -378,6 +378,7 @@ xfs_initialize_perag(
>>>> pag->pagb_tree = RB_ROOT;
>>>> #endif /* __KERNEL__ */
>>>>
>>>> + atomic_set(&pag->pag_nr_pending_extents, 0);
>>>> error = xfs_buf_hash_init(pag);
>>>> if (error)
>>>> goto out_remove_pag;
>>>> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
>>>> index 5e18536dfdce..5950bc36a32c 100644
>>>> --- a/fs/xfs/libxfs/xfs_ag.h
>>>> +++ b/fs/xfs/libxfs/xfs_ag.h
>>>> @@ -82,6 +82,11 @@ struct xfs_perag {
>>>> uint16_t pag_sick;
>>>> spinlock_t pag_state_lock;
>>>>
>>>> + /*
>>>> + * Number of concurrent extent freeings (not committed to CIL yet)
>>>> + * on this AG.
>>>> + */
>>>> + atomic_t pag_nr_pending_extents;
>>>> spinlock_t pagb_lock; /* lock for pagb_tree */
>>>> struct rb_root pagb_tree; /* ordered tree of busy extents */
>>>> unsigned int pagb_gen; /* generation count for pagb_tree */
>>>> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
>>>> index 011b50469301..1dbf36d9c1c9 100644
>>>> --- a/fs/xfs/xfs_extfree_item.c
>>>> +++ b/fs/xfs/xfs_extfree_item.c
>>>> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
>>>> return efdp;
>>>> }
>>>>
>>>> +/*
>>>> + * Fill the EFD with all extents from the EFI and set the counter.
>>>> + * Note: the EFD should contain at least one extent already.
>>>> + */
>>>> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
>>>> +{
>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>>>> + uint i;
>>>> +
>>>> + i = efdp->efd_next_extent;
>>>> + ASSERT(i > 0);
>>>> + for (; i < efip->efi_format.efi_nextents; i++) {
>>>> + efdp->efd_format.efd_extents[i] =
>>>> + efip->efi_format.efi_extents[i];
>>>
>>> Why is it necessary to fill the EFD mapping structures? Nobody /ever/
>>> looks at those; the only part that matters to log recovery is matching
>>> efd.efd_efi_id to efi.efi_id.
>>
>> Yes, that’s for EFI/EFD matching for log recovery.
>> The records to be retried would be in a new EFI.
>>
>>>
>>> But, I guess it looks funny to requeue an EFI and have the EFD for the
>>> old EFI missing a bunch of fields.
>>
>> The EFD for the original EFI is not missing anything; it is expected
>> to be exactly the same as if all the records were processed.
>> This way we move records to new EFI(s) to avoid the deadlock.
>>
>>>
>>>> + }
>>>> + efdp->efd_next_extent = i;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Check if xefi is the first in the efip.
>>>> + * Returns true if so, and false otherwise.
>>>> + */
>>>> +static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
>>>> + struct xfs_extent_free_item *xefi)
>>>> +{
>>>> + return efip->efi_format.efi_extents[0].ext_start ==
>>>> + xefi->xefi_startblock;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Check if the xefi needs to be in a new transaction.
>>>> + * efip is the containing EFI of xefi.
>>>> + * Return true if so, false otherwise.
>>>> + */
>>>> +static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
>>>> + struct xfs_efi_log_item *efip,
>>>> + struct xfs_extent_free_item *xefi)
>>>> +{
>>>> + bool ret = true;
>>>> + int nr_pre;
>>>> + xfs_agnumber_t agno;
>>>> + struct xfs_perag *pag;
>>>> +
>>>> + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
>>>> + pag = xfs_perag_get(mp, agno);
>>>> + /* The first extent in EFI is always OK to go */
>>>> + if (xfs_is_first_extent_in_efi(efip, xefi)) {
>>>> + atomic_inc(&pag->pag_nr_pending_extents);
>>>> + ret = false;
>>>> + goto out_put;
>>>> + }
>>>> +
>>>> + /*
>>>> + * Now the extent is the 2nd or subsequent in the efip. We need
>>>> + * new transaction if the AG already has busy extents pending.
>>>> + */
>>>> + nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
>>>> + /* No previous pending extent freeing to this AG */
>>>> + if (nr_pre == 0) {
>>>> + ret = false;
>>>> + goto out_put;
>>>> + }
>>>> +
>>>> + atomic_dec(&pag->pag_nr_pending_extents);
>>>> +out_put:
>>>> + xfs_perag_put(pag);
>>>> + return ret;
>>>> +}
>>>> +
>>>> /*
>>>> * Free an extent and log it to the EFD. Note that the transaction is marked
>>>> * dirty regardless of whether the extent free succeeds or fails to support the
>>>> @@ -356,6 +425,28 @@ xfs_trans_free_extent(
>>>> xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
>>>> xefi->xefi_startblock);
>>>> int error;
>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>>>> +
>>>> + if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
>>>> + /*
>>>> + * This should be the 2nd+ extent, we don't have to mark the
>>>> + * transaction and efd dirty, those are already done with the
>>>> + * first extent.
>>>> + */
>>>> + ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
>>>> + ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
>>>> + ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
>>>> +
>>>> + xfs_fill_efd_with_efi(efdp);
>>>> +
>>>> + /*
>>>> + * A previous extent in the same AG is processed with the current
>>>> + * transaction. To avoid possible AGFL allocation deadlock,
>>>> + * we roll the transaction and then restart with this extent
>>>> + * with new transaction.
>>>> + */
>>>> + return -EAGAIN;
>>>> + }
>>>>
>>>> oinfo.oi_owner = xefi->xefi_owner;
>>>> if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
>>>> @@ -369,6 +460,13 @@ xfs_trans_free_extent(
>>>> error = __xfs_free_extent(tp, xefi->xefi_startblock,
>>>> xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
>>>> xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
>>>> + if (error) {
>>>> + struct xfs_perag *pag;
>>>> +
>>>> + pag = xfs_perag_get(mp, agno);
>>>> + atomic_dec(&pag->pag_nr_pending_extents);
>>>> + xfs_perag_put(pag);
>>>> + }
>>>> /*
>>>> * Mark the transaction dirty, even on error. This ensures the
>>>> * transaction is aborted, which:
>>>> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
>>>
>>> This function comes with an unused **state variable:
>>>
>>
>> I don’t see why we need to take different action according to
>> the ‘state’.
>>
>>> STATIC int
>>> xfs_extent_free_finish_item(
>>> struct xfs_trans *tp,
>>> struct xfs_log_item *done,
>>> struct list_head *item,
>>> struct xfs_btree_cur **state)
>>>
>>> What if, after each xfs_trans_free_extent call, we stuff *state with
>>> xefi_startblock's AG?
>>>
>>
>> see above.
>>
>>> Then, upon entry to xfs_extent_free_finish_item, we compare *state with
>>> the xefi item and return EAGAIN if we're processing an EFI from the same
>>> AG as the previous call to xfs_extent_free_finish_item? Something like
>>> this (totally untested):
>>>
>>> STATIC int
>>> xfs_extent_free_finish_item(
>>> struct xfs_trans *tp,
>>> struct xfs_log_item *done,
>>> struct list_head *item,
>>> struct xfs_btree_cur **state)
>>> {
>>> struct xfs_extent_free_item *xefi;
>>> int error;
>>>
>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>>>
>>> if (*state) {
>>> struct xfs_perag *oldpag = *(struct xfs_perag **)state;
>>>
>>> /*
>>> * If we're freeing two extents from the same AG, we
>>> * must roll the transaction between the two extents to
>>> * avoid a livelock resulting from AGFL fixing waiting
>>> * on the extent that we just freed to become unbusy.
>>> */
>>> if (oldpag == xefi->xefi_pag) {
>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
>>> xfs_perag_put(oldpag);
>>> return -EAGAIN;
>>> }
>>
>> As I said, the first record is always let go, so we don’t care about the previous AG.
>>
>>> xfs_perag_put(oldpag);
>>> }
>>>
>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
>>> if (!error)
>>> *state = (void *)xfs_perag_hold(xefi->xefi_pag);
>>>
>>> xfs_extent_free_put_group(xefi);
>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
>>> return error;
>>> }
>>>
>>> Obviously, you now need a ->finish_cleanup function to drop the perag
>>> reference that may be stashed in @state. This (I think) avoids the
>>> livelock by ensuring that xfs_trans_free_extent always starts with a
>>> transaction that does not contain any busy extents from the same AG that
>>> it's acting on, right?
>>
>> Nope, we don’t need to pass perag in state. Still the rule:
>>
>> The first record in an EFI (no matter whether it’s the original EFI or a new
>> EFI) just goes (because it won’t hit the deadlock).
>>
>> For the 2nd or subsequent records, we need to see if there are any pending
>> busy extents in the same AG as the record belongs to. “Pending” means
>> those are still in xfs_trans->t_busy (not committed to xlog_cil_pcp->busy_extents
>> yet). We know that by checking the newly introduced per-AG counter
>> pag_nr_pending_extents. If there are, we retry the record in a new EFI
>> in a new transaction after committing the original transaction. Within the new
>> EFI, the 2nd record of the original EFI becomes the 1st record, and it can just
>> go (because it won’t hit the deadlock).
>
> <nod> So in other words, the pag_nr_pending_extents counter buys us the
> performance advantage that we don't need to add the extra roll between
> (3/X) and (3/Y) if the log is running fast enough to clear the busy
> extent before we even start evaluating the relogged EFI (with 3/Y in
> it).
I’d think there needs to be a transaction roll between (3/X) and (3/Y). If we don’t do
that, we would have a “pending” busy extent in the current xfs_trans->t_busy while
processing (3/Y). And that busy extent has no chance of being cleared, because it’s not
committed to xlog_cil_pcp->busy_extents yet and CIL flushing won’t take care of busy
extents which are not in busy_extents. So with no transaction roll between (3/X) and
(3/Y), we still risk the deadlock when allocating blocks for the AGFL.
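(A worked trace of this scheme on a hypothetical single-AG EFI {3/X, 3/Y, 3/Z},
with the AG 3 counter starting at zero; an illustrative sketch, not output from
the patch:)

    tx1: free 3/X (counter 0 -> 1); 3/Y finds the counter elevated -> -EAGAIN;
         the EFD for EFI1 is filled with all three extents, a new EFI2
         {3/Y, 3/Z} is logged, and tx1 is rolled (counter 1 -> 0 at CIL
         insertion).
    tx2: free 3/Y, now the first record (counter 0 -> 1); 3/Z -> -EAGAIN;
         EFI3 {3/Z}, roll (counter 1 -> 0).
    tx3: free 3/Z.  Each transaction frees at most one extent in the
         contended AG, so AGFL fixing never waits on this thread's own
         uncommitted busy extent.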
>
> How often does it happen that we have to roll the transaction and relog
> the EFI?
It depends on the use case; this may happen, for example, when truncating a
fragmented file whose extents sit in the same AG.
>
>> We are using a per-AG counter here to know if there are pending busy extents,
>> rather than comparing against the previous AGs involved in the same EFI, which
>> also avoids the potential ABBA deadlock:
>> thread 1: {3/X, 4/Y}
>> thread 2: {4/Z, 3/T}
>>
>>>
>>> A simpler version of the above would make it so that we always roll
>>> between EFI records and don't have to manage perag references:
>>>
>>> #define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
>>> STATIC int
>>> xfs_extent_free_finish_item(
>>> struct xfs_trans *tp,
>>> struct xfs_log_item *done,
>>> struct list_head *item,
>>> struct xfs_btree_cur **state)
>>> {
>>> struct xfs_extent_free_item *xefi;
>>> int error;
>>>
>>> /*
>>> * If we're freeing two extents, roll the transaction between
>>> * the two extents to avoid a livelock resulting from AGFL
>>> * fixing waiting on the extent that we just freed to become
>>> * unbusy. It's not necessary to roll if the previous and
>>> * current EFI record point to different AGs, but this
>>> * simplifies the state tracking.
>>> */
>>> if (*state == XFS_EFI_STATE_ROLL) {
>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
>>> *state = NULL;
>>> return -EAGAIN;
>>> }
>>>
>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>>>
>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
>>> if (!error)
>>> *state = XFS_EFI_STATE_ROLL;
>>>
>>> xfs_extent_free_put_group(xefi);
>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
>>> return error;
>>> }
>>>
>>
>> Done this way, it looks to me like the original EFI is always split into several
>> EFIs, each including only one record. It’s the same result as changing
>> XFS_EFI_MAX_FAST_EXTENTS to 1. Dave doesn’t like that.
>
> Yes.
>
>> My implementation splits them only when necessary, as Dave suggested.
>>
>>> (Did you and Dave already talk about this?)
>>
>> There was a lot of discussion in these two threads:
>> [PATCH 2/2] xfs: log recovery stage split EFIs with multiple extents
>> [PATCH 1/2] xfs: IO time one extent per EFI
>
> Ok, I'll go reread all that.
thanks,
wengang
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-12 22:20 ` Wengang Wang
@ 2023-05-16 16:05 ` Wengang Wang
2023-05-17 0:59 ` Darrick J. Wong
0 siblings, 1 reply; 16+ messages in thread
From: Wengang Wang @ 2023-05-16 16:05 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs@vger.kernel.org
Hi Darrick, do you have further comments?
thanks,
wengang
> On May 12, 2023, at 3:20 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
>
>
>
>> On May 12, 2023, at 2:13 PM, Darrick J. Wong <djwong@kernel.org> wrote:
>>
>> On Fri, May 12, 2023 at 07:55:28PM +0000, Wengang Wang wrote:
>>> Hi Darrick,
>>>
>>>> On May 12, 2023, at 11:24 AM, Darrick J. Wong <djwong@kernel.org> wrote:
>>>>
>>>> Sorry for the slow response, I thought Dave would've responded by now.
>>>>
>>>> On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
>>>>> To avoid possible deadlock when allocating AGFL blocks, change the behaviour.
>>>>> The orignal hehaviour for freeing extents in an EFI is that the extents in
>>>>> the EFI were processed one by one in the same transaction. When the second
>>>>> and subsequent extents are being processed, we have produced busy extents for
>>>>> previous extents. If the second and subsequent extents are from the same AG
>>>>> as the busy extents are, we have the risk of deadlock when allocating AGFL
>>>>> blocks. A typical calltrace for the deadlock is like this:
>>>>
>>>> It's been a few weeks, so let me try to remember what this is all about.
>>>> You have one EFI containing multiple extents to free:
>>>>
>>>> {agno: 3, agbno: X, ... }
>>>> {agno: 3, agbno: Y, ... }
>>>>
>>>> AG 3 is nearly full, so we free the first extent (3/X), which adds it to
>>>> the bnobt, records the newly freed extent in the extentbusylist, and
>>>> attaches the record to the transaction's busy extent list.
>>>>
>>>> Then we move on to the second record in the EFI item. We want to free
>>>> (3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
>>>> of the bnobt/cntbt btrees.
>>>>
>>>> So we try to fix the AG 3 AGFL to be long enough. We can find the space
>>>> in the bnobt, but the only space available is the (3/X) that we put
>>>> there during the last step. That space is still(!) busy and still
>>>> attached to the transaction, so the CIL cannot clear it. The AGFL fixer
>>>> sees that the space is busy, so it waits for it... and now we're dead in
>>>> the water.
>>>>
>>>> The key to this deadlock is a thread waiting on its own uncommitted busy
>>>> extent, right?
>>>>
>>>
>>> Yep, above is all correct.
>>>
>>>>>
>>>>> #0 context_switch() kernel/sched/core.c:3881
>>>>> #1 __schedule() kernel/sched/core.c:5111
>>>>> #2 schedule() kernel/sched/core.c:5186
>>>>> #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
>>>>> #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
>>>>> #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
>>>>> #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
>>>>> #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
>>>>> #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
>>>>> #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
>>>>> #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
>>>>> #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
>>>>> #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
>>>>> #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
>>>>> #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764
>>>>> #15 xfs_mountfs() fs/xfs/xfs_mount.c:978
>>>>> #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
>>>>> #17 mount_bdev() fs/super.c:1417
>>>>> #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985
>>>>> #19 legacy_get_tree() fs/fs_context.c:647
>>>>> #20 vfs_get_tree() fs/super.c:1547
>>>>> #21 do_new_mount() fs/namespace.c:2843
>>>>> #22 do_mount() fs/namespace.c:3163
>>>>> #23 ksys_mount() fs/namespace.c:3372
>>>>> #24 __do_sys_mount() fs/namespace.c:3386
>>>>> #25 __se_sys_mount() fs/namespace.c:3383
>>>>> #26 __x64_sys_mount() fs/namespace.c:3383
>>>>> #27 do_syscall_64() arch/x86/entry/common.c:296
>>>>> #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180
>>>>>
>>>>> The deadlock could happen at both IO time and log recover time.
>>>>>
>>>>> To avoid above deadlock, this patch changes the extent free procedure.
>>>>> 1) it always let the first extent from the EFI go (no change).
>>>>> 2) increase the (new) AG counter when it let a extent go.
>>>>> 3) for the 2nd+ extents, if the owning AGs ready have pending extents
>>>>> don't let the extent go with current transaction. Instead, move the
>>>>> extent in question and subsequent extents to a new EFI and try the new
>>>>> EFI again with new transaction (by rolling current transaction).
>>>>> 4) for the EFD to orginal EFI, fill it with all the extents from the original
>>>>> EFI.
>>>>> 5) though the new EFI is placed after original EFD, it's safe as they are in
>>>>> same in-memory transaction.
>>>>> 6) The new AG counter for pending extent freeings is decremented after the
>>>>> log items in in-memory transaction is committed to CIL.
>>>>
>>>> So the solution you're proposing is a per-AG counter of however many
>>>> threads have started an EFI free. If the second (or subsequent) EFI
>>>> free encounters an AG with a nonzero counter, it'll return EAGAIN to
>>>> cycle the transaction. The cycling is critical to getting the busy
>>>> extent to the CIL so the log can process it and make it unbusy. If the
>>>> AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
>>>> freeing, having bumped the per-AG counter. Right?
>>>
>>> Right.
>>>
>>>>
>>>> Therefore, the above deadlock sequence now becomes:
>>>>
>>>> 1. Free (3/X), placing (3/X) on the transaction's busy list.
>>>>
>>>> 2. Start trying to free (3/Y), notice that AG 3 has elevated
>>>> pag_nr_pending_extents, return EAGAIN
>>>>
>>>> 3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
>>>> transaction
>>>>
>>>> 4. Try to free (3/Y) again
>>>>
>>>> At step 4, if pag_nr_pending_extents is still elevated, does that
>>>> mean we return EAGAIN and keep rolling the transaction until it's not
>>>> elevated? Shouldn't we be able to proceed at step 4, even if we end up
>>>> waiting on the log to clear (3/X) from the busy list?
>>>
>>> The pag_nr_pending_extents is expected to be decremented in step 3.
>>
>> OH. I missed (and you pointed out below) the subtlety that step 3 gives
>> us a fresh transaction /and/ a fresh EFI, so after the roll, (3/Y) is
>> the "first" EFI and never has to wait.
>
> Yes exactly.
>
>>
>>> The changes in xlog_cil_insert_items() is supposed to do so. So at
>>> step 4, for this cass {3/x, 3/y}, pag_nr_pending_extents should be
>>> decremented to zero, thus step 4 can go (after that busy extent 3/x is
>>> cleared).
>>
>> So even if the log is being slow and the CIL hasn't cleared the busy
>> extent (3/X) by the time we finish rolling the tx and relogging the EFI,
>> the relogged EFI will start with (3/Y) and move forward without waiting.
>> Got it.
>
> Correct. There is no waiting while we start to process the EFI records,
> The point here is that we process the records in the existing transaction or
> in new transactions (rolling current one).
>
>>
>>>>
>>>> What happens if the log actually clears (3/X) from the busy list after
>>>> step 3 but then some other thread starts processing an EFI for (3/Z)?
>>>
>>> There is a rule in the patch that the first record in an EFI can always go (bumping
>>> the per AG counter). That’s because the freeing the first record never has
>>> the deadlock issue.
>>>
>>> The 3/Y would go because now the 3/Y is the first record of the new EFI
>>> containing only record of 3/Y.
>>
>> Oh, ok. I missed that.
>>
>>> For the other thread for 3/Z, 3/Z can go too because it’s the first record in
>>> that EFI (bumping the per AG counter, now the counter could be 3 at most).
>>
>> <nod>
>>
>>>
>>>> Does that cause this thread to roll repeatedly waiting for another
>>>> thread's EFI to clear?
>>>
>>> Nope. As I said, the first record can always go. For the 2nd and subsequent
>>> record, on retry, they would become the first record in the new FEI and it goes.
>>
>> <nod>
>>
>>>>
>>>> IOWs, I'm not sure we can always make forward progress with this scheme,
>>>> and I'd prefer to avoid introducing more global state.
>>>
>>> See above explanation.
>>>
>>>>
>>>>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>>>>> ---
>>>>> fs/xfs/libxfs/xfs_ag.c | 1 +
>>>>> fs/xfs/libxfs/xfs_ag.h | 5 ++
>>>>> fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
>>>>> fs/xfs/xfs_log_cil.c | 24 ++++++++-
>>>>> 4 files changed, 138 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
>>>>> index 86696a1c6891..61ef61e05668 100644
>>>>> --- a/fs/xfs/libxfs/xfs_ag.c
>>>>> +++ b/fs/xfs/libxfs/xfs_ag.c
>>>>> @@ -378,6 +378,7 @@ xfs_initialize_perag(
>>>>> pag->pagb_tree = RB_ROOT;
>>>>> #endif /* __KERNEL__ */
>>>>>
>>>>> + atomic_set(&pag->pag_nr_pending_extents, 0);
>>>>> error = xfs_buf_hash_init(pag);
>>>>> if (error)
>>>>> goto out_remove_pag;
>>>>> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
>>>>> index 5e18536dfdce..5950bc36a32c 100644
>>>>> --- a/fs/xfs/libxfs/xfs_ag.h
>>>>> +++ b/fs/xfs/libxfs/xfs_ag.h
>>>>> @@ -82,6 +82,11 @@ struct xfs_perag {
>>>>> uint16_t pag_sick;
>>>>> spinlock_t pag_state_lock;
>>>>>
>>>>> + /*
>>>>> + * Number of concurrent extent freeings (not committed to CIL yet)
>>>>> + * on this AG.
>>>>> + */
>>>>> + atomic_t pag_nr_pending_extents;
>>>>> spinlock_t pagb_lock; /* lock for pagb_tree */
>>>>> struct rb_root pagb_tree; /* ordered tree of busy extents */
>>>>> unsigned int pagb_gen; /* generation count for pagb_tree */
>>>>> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
>>>>> index 011b50469301..1dbf36d9c1c9 100644
>>>>> --- a/fs/xfs/xfs_extfree_item.c
>>>>> +++ b/fs/xfs/xfs_extfree_item.c
>>>>> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
>>>>> return efdp;
>>>>> }
>>>>>
>>>>> +/*
>>>>> + * Fill the EFD with all extents from the EFI and set the counter.
>>>>> + * Note: the EFD should comtain at least one extents already.
>>>>> + */
>>>>> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
>>>>> +{
>>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>>>>> + uint i;
>>>>> +
>>>>> + i = efdp->efd_next_extent;
>>>>> + ASSERT(i > 0);
>>>>> + for (; i < efip->efi_format.efi_nextents; i++) {
>>>>> + efdp->efd_format.efd_extents[i] =
>>>>> + efip->efi_format.efi_extents[i];
>>>>
>>>> Why is it necessary to fill the EFD mapping structures? Nobody /ever/
>>>> looks at those; the only part that matters to log recovery is matching
>>>> efd.efd_efi_id to efi.efi_id.
>>>
>>> Yes, that’s for EFI/EFD matching for log recovery.
>>> The records to be retried would be in a new EFI.
>>>
>>>>
>>>> But, I guess it looks funny to requeue an EFI and have the EFD for the
>>>> old EFI missing a bunch of fields.
>>>
>>> The EFD to the original EFI is not missing anything, it is expected
>>> to be exact the same as if all the records are processed.
>>> By this way we move records to new EFI(s) to avoid the deadlock.
>>>
>>>>
>>>>> + }
>>>>> + efdp->efd_next_extent = i;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Check if xefi is the first in the efip.
>>>>> + * Returns true if so, ad false otherwise
>>>>> + */
>>>>> +static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
>>>>> + struct xfs_extent_free_item *xefi)
>>>>> +{
>>>>> + return efip->efi_format.efi_extents[0].ext_start ==
>>>>> + xefi->xefi_startblock;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Check if the xefi needs to be in a new transaction.
>>>>> + * efip is the containing EFI of xefi.
>>>>> + * Return true if so, false otherwise.
>>>>> + */
>>>>> +static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
>>>>> + struct xfs_efi_log_item *efip,
>>>>> + struct xfs_extent_free_item *xefi)
>>>>> +{
>>>>> + bool ret = true;
>>>>> + int nr_pre;
>>>>> + xfs_agnumber_t agno;
>>>>> + struct xfs_perag *pag;
>>>>> +
>>>>> + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
>>>>> + pag = xfs_perag_get(mp, agno);
>>>>> + /* The first extent in EFI is always OK to go */
>>>>> + if (xfs_is_first_extent_in_efi(efip, xefi)) {
>>>>> + atomic_inc(&pag->pag_nr_pending_extents);
>>>>> + ret = false;
>>>>> + goto out_put;
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> + * Now the extent is the 2nd or subsequent in the efip. We need
>>>>> + * new transaction if the AG already has busy extents pending.
>>>>> + */
>>>>> + nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
>>>>> + /* No prevoius pending extent freeing to this AG */
>>>>> + if (nr_pre == 0) {
>>>>> + ret = false;
>>>>> + goto out_put;
>>>>> + }
>>>>> +
>>>>> + atomic_dec(&pag->pag_nr_pending_extents);
>>>>> +out_put:
>>>>> + xfs_perag_put(pag);
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> /*
>>>>> * Free an extent and log it to the EFD. Note that the transaction is marked
>>>>> * dirty regardless of whether the extent free succeeds or fails to support the
>>>>> @@ -356,6 +425,28 @@ xfs_trans_free_extent(
>>>>> xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
>>>>> xefi->xefi_startblock);
>>>>> int error;
>>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
>>>>> +
>>>>> + if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
>>>>> + /*
>>>>> + * This should be the 2nd+ extent, we don't have to mark the
>>>>> + * transaction and efd dirty, those are already done with the
>>>>> + * first extent.
>>>>> + */
>>>>> + ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
>>>>> + ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
>>>>> + ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
>>>>> +
>>>>> + xfs_fill_efd_with_efi(efdp);
>>>>> +
>>>>> + /*
>>>>> + * A preious extent in same AG is processed with the current
>>>>> + * transaction. To avoid possible AGFL allocation deadlock,
>>>>> + * we roll the transaction and then restart with this extent
>>>>> + * with new transaction.
>>>>> + */
>>>>> + return -EAGAIN;
>>>>> + }
>>>>>
>>>>> oinfo.oi_owner = xefi->xefi_owner;
>>>>> if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
>>>>> @@ -369,6 +460,13 @@ xfs_trans_free_extent(
>>>>> error = __xfs_free_extent(tp, xefi->xefi_startblock,
>>>>> xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
>>>>> xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
>>>>> + if (error) {
>>>>> + struct xfs_perag *pag;
>>>>> +
>>>>> + pag = xfs_perag_get(mp, agno);
>>>>> + atomic_dec(&pag->pag_nr_pending_extents);
>>>>> + xfs_perag_put(pag);
>>>>> + }
>>>>> /*
>>>>> * Mark the transaction dirty, even on error. This ensures the
>>>>> * transaction is aborted, which:
>>>>> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
>>>>
>>>> This function comes with an unused **state variable:
>>>>
>>>
>>> I don’t see why we need to take different action according to
>>> the ‘state’.
>>>
>>>> STATIC int
>>>> xfs_extent_free_finish_item(
>>>> struct xfs_trans *tp,
>>>> struct xfs_log_item *done,
>>>> struct list_head *item,
>>>> struct xfs_btree_cur **state)
>>>>
>>>> What if, after each xfs_trans_free_extent call, we stuff *state with
>>>> xefi_startblock's AG?
>>>>
>>>
>>> see Above.
>>>
>>>> Then, upon entry to xfs_extent_free_finish_item, we compare *state with
>>>> the xefi item and return EAGAIN if we're processing an EFI from the same
>>>> AG as the previous call to xfs_extent_free_finish_item? Something like
>>>> this (totally untested):
>>>>
>>>> STATIC int
>>>> xfs_extent_free_finish_item(
>>>> struct xfs_trans *tp,
>>>> struct xfs_log_item *done,
>>>> struct list_head *item,
>>>> struct xfs_btree_cur **state)
>>>> {
>>>> struct xfs_extent_free_item *xefi;
>>>> int error;
>>>>
>>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>>>>
>>>> if (*state) {
>>>> struct xfs_perag *oldpag = *(struct xfs_perag **)state;
>>>>
>>>> /*
>>>> * If we're freeing two extents from the same AG, we
>>>> * must roll the transaction between the two extents to
>>>> * avoid a livelock resulting from AGFL fixing waiting
>>>> * on the extent that we just freed to become unbusy.
>>>> */
>>>> if (oldpag == xefi->xefi_pag) {
>>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
>>>> xfs_perag_put(oldpag);
>>>> return -EAGAIN;
>>>> }
>>>
>>> As I said the first record would let go, we don’t care previous AG.
>>>
>>>> xfs_perag_put(oldpag);
>>>> }
>>>>
>>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
>>>> if (!error)
>>>> *state = (void *)xfs_perag_hold(xefi->xefi_pag);
>>>>
>>>> xfs_extent_free_put_group(xefi);
>>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
>>>> return error;
>>>> }
>>>>
>>>> Obviously, you now need a ->finish_cleanup function to drop the perag
>>>> reference that may be stashed in @state. This (I think) avoids the
>>>> livelock by ensuring that xfs_trans_free_extent always starts with a
>>>> transaction that does not contain any busy extents from the same AG that
>>>> it's acting on, right?
>>>
>>> Nope, we don’t need to pass perag in state. Still the rule:
>>>
>>> The first record in an EFI (no matter it’s the original EFI or new EFIs) just
>>> goes (because it won’t hit the deadlock).
>>>
>>> For the 2nd or subsequent records, we need to see if there are any pending
>>> busy_extent to the same AG as the record belongs to. “pending” means
>>> those are still in xfs_trans->t_busy (not committed to xlog_cil_pcp->busy_extents
>>> yet). We know that by checking the newly introduced per AG counter
>>> pag_nr_pending_extents. If there are, we retry the record in a new EFI
>>> in a new transaction after committing the original transaction. With in the new
>>> EFI, the 2nd record in the original EFI become the 1st record, and it just can
>>> go (because it won’t hit the deadlock).
>>
>> <nod> So in other words, the pag_nr_pending_extents counter buys us the
>> performance advantage that we don't need to add the extra roll between
>> (3/X) and (3/Y) if the log is running fast enough to clear the busy
>> extent before we even start evaluating the relogged EFI (with 3/Y in
>> it).
>
> I’d think there needs a transaction roll between (3/X) and (3/Y). If we don’t do that
> we would have a “pending” busy extent in current xfs_trans->t_busy while processing
> (3/Y). And that busy extent had no chance to be cleared because it’s not committed
> to xlog_cil_pcp->busy_extents yet and CIL flushing won’t take care of busy extents
> which are not in busy_extents. So if no transaction roll in (3/X) and (3/Y), we still
> have the risk to the dead lock when allocating blocks for AGFL.
>
>>
>> How often does it happen that we have to roll the transaction and relog
>> the EFI?
>
> It could depend on the use case. This may happen when truncating files.
>
>
>>
>>> We are using per AG counter here to know if there are pending busy extents
>>> rather than comparing with the involved previous AGs in the same EFI to avoid
>>> also the potential ABBA dead lock:
>>> thread 1: {3/X, 4/Y}
>>> thread 2: {4/Z, 3/T}
>>>
>>>>
>>>> A simpler version of the above would make it so that we always roll
>>>> between EFI records and don't have to manage perag references:
>>>>
>>>> #define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
>>>> STATIC int
>>>> xfs_extent_free_finish_item(
>>>> struct xfs_trans *tp,
>>>> struct xfs_log_item *done,
>>>> struct list_head *item,
>>>> struct xfs_btree_cur **state)
>>>> {
>>>> struct xfs_extent_free_item *xefi;
>>>> int error;
>>>>
>>>> /*
>>>> * If we're freeing two extents, roll the transaction between
>>>> * the two extents to avoid a livelock resulting from AGFL
>>>> * fixing waiting on the extent that we just freed to become
>>>> * unbusy. It's not necessary to roll if the previous and
>>>> * current EFI record point to different AGs, but this
>>>> * simplifies the state tracking.
>>>> */
>>>> if (*state == XFS_EFI_STATE_ROLL) {
>>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
>>>> *state = NULL;
>>>> return -EAGAIN;
>>>> }
>>>>
>>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
>>>>
>>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
>>>> if (!error)
>>>> *state = XFS_EFI_STATE_ROLL;
>>>>
>>>> xfs_extent_free_put_group(xefi);
>>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
>>>> return error;
>>>> }
>>>>
>>>
>>> That way, it looks to me like the original EFI is always split into several ones,
>>> each including only one record. It’s the same result as changing
>>> XFS_EFI_MAX_FAST_EXTENTS to 1. Dave doesn’t like that.
>>
>> Yes.
>>
>>> My implementation splits them only when necessary, as Dave suggested.
>>>
>>>> (Did you and Dave already talk about this?)
>>>
>>> There was much discussion in these two threads:
>>> [PATCH 2/2] xfs: log recovery stage split EFIs with multiple extents
>>> [PATCH 1/2] xfs: IO time one extent per EFI
>>
>> Ok, I'll go reread all that.
>
> thanks,
> wengang
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-16 16:05 ` Wengang Wang
@ 2023-05-17 0:59 ` Darrick J. Wong
2023-05-17 1:40 ` Dave Chinner
0 siblings, 1 reply; 16+ messages in thread
From: Darrick J. Wong @ 2023-05-17 0:59 UTC (permalink / raw)
To: Wengang Wang; +Cc: linux-xfs@vger.kernel.org
On Tue, May 16, 2023 at 04:05:20PM +0000, Wengang Wang wrote:
> Hi Darrick, do you have further comments?
Now I do. I chatted with Dave earlier about this, since I think it's a
layering violation to have the EFI and the CIL (aka the logging layer)
banging on an atomic counter in the perag structure.
(More later on below)
> thanks,
> wengang
>
> > On May 12, 2023, at 3:20 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
> >
> >
> >
> >> On May 12, 2023, at 2:13 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> >>
> >> On Fri, May 12, 2023 at 07:55:28PM +0000, Wengang Wang wrote:
> >>> Hi Darrick,
> >>>
> >>>> On May 12, 2023, at 11:24 AM, Darrick J. Wong <djwong@kernel.org> wrote:
> >>>>
> >>>> Sorry for the slow response, I thought Dave would've responded by now.
> >>>>
> >>>> On Mon, Apr 24, 2023 at 03:51:02PM -0700, Wengang Wang wrote:
> >>>>> To avoid possible deadlock when allocating AGFL blocks, change the behaviour.
> >>>>> The orignal hehaviour for freeing extents in an EFI is that the extents in
> >>>>> the EFI were processed one by one in the same transaction. When the second
> >>>>> and subsequent extents are being processed, we have produced busy extents for
> >>>>> previous extents. If the second and subsequent extents are from the same AG
> >>>>> as the busy extents are, we have the risk of deadlock when allocating AGFL
> >>>>> blocks. A typical calltrace for the deadlock is like this:
> >>>>
> >>>> It's been a few weeks, so let me try to remember what this is all about.
> >>>> You have one EFI containing multiple extents to free:
> >>>>
> >>>> {agno: 3, agbno: X, ... }
> >>>> {agno: 3, agbno: Y, ... }
> >>>>
> >>>> AG 3 is nearly full, so we free the first extent (3/X), which adds it to
> >>>> the bnobt, records the newly freed extent in the extentbusylist, and
> >>>> attaches the record to the transaction's busy extent list.
> >>>>
> >>>> Then we move on to the second record in the EFI item. We want to free
> >>>> (3/Y), but whoops the AGFL isn't big enough to handle maximal expansion
> >>>> of the bnobt/cntbt btrees.
> >>>>
> >>>> So we try to fix the AG 3 AGFL to be long enough. We can find the space
> >>>> in the bnobt, but the only space available is the (3/X) that we put
> >>>> there during the last step. That space is still(!) busy and still
> >>>> attached to the transaction, so the CIL cannot clear it. The AGFL fixer
> >>>> sees that the space is busy, so it waits for it... and now we're dead in
> >>>> the water.
> >>>>
> >>>> The key to this deadlock is a thread waiting on its own uncommitted busy
> >>>> extent, right?
> >>>>
> >>>
> >>> Yep, above is all correct.
> >>>
> >>>>>
> >>>>> #0 context_switch() kernel/sched/core.c:3881
> >>>>> #1 __schedule() kernel/sched/core.c:5111
> >>>>> #2 schedule() kernel/sched/core.c:5186
> >>>>> #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
> >>>>> #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
> >>>>> #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
> >>>>> #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
> >>>>> #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
> >>>>> #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
> >>>>> #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
> >>>>> #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
> >>>>> #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
> >>>>> #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
> >>>>> #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
> >>>>> #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764
> >>>>> #15 xfs_mountfs() fs/xfs/xfs_mount.c:978
> >>>>> #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
> >>>>> #17 mount_bdev() fs/super.c:1417
> >>>>> #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985
> >>>>> #19 legacy_get_tree() fs/fs_context.c:647
> >>>>> #20 vfs_get_tree() fs/super.c:1547
> >>>>> #21 do_new_mount() fs/namespace.c:2843
> >>>>> #22 do_mount() fs/namespace.c:3163
> >>>>> #23 ksys_mount() fs/namespace.c:3372
> >>>>> #24 __do_sys_mount() fs/namespace.c:3386
> >>>>> #25 __se_sys_mount() fs/namespace.c:3383
> >>>>> #26 __x64_sys_mount() fs/namespace.c:3383
> >>>>> #27 do_syscall_64() arch/x86/entry/common.c:296
> >>>>> #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180
> >>>>>
> >>>>> The deadlock could happen at both IO time and log recover time.
> >>>>>
> >>>>> To avoid above deadlock, this patch changes the extent free procedure.
> >>>>> 1) it always let the first extent from the EFI go (no change).
> >>>>> 2) increase the (new) AG counter when it let a extent go.
> >>>>> 3) for the 2nd+ extents, if the owning AGs ready have pending extents
> >>>>> don't let the extent go with current transaction. Instead, move the
> >>>>> extent in question and subsequent extents to a new EFI and try the new
> >>>>> EFI again with new transaction (by rolling current transaction).
> >>>>> 4) for the EFD to orginal EFI, fill it with all the extents from the original
> >>>>> EFI.
> >>>>> 5) though the new EFI is placed after original EFD, it's safe as they are in
> >>>>> same in-memory transaction.
> >>>>> 6) The new AG counter for pending extent freeings is decremented after the
> >>>>> log items in in-memory transaction is committed to CIL.
> >>>>
> >>>> So the solution you're proposing is a per-AG counter of however many
> >>>> threads have started an EFI free. If the second (or subsequent) EFI
> >>>> free encounters an AG with a nonzero counter, it'll return EAGAIN to
> >>>> cycle the transaction. The cycling is critical to getting the busy
> >>>> extent to the CIL so the log can process it and make it unbusy. If the
> >>>> AG wasn't contended (pag_nr_pending_extents was 0), it proceeds with the
> >>>> freeing, having bumped the per-AG counter. Right?
> >>>
> >>> Right.
> >>>
> >>>>
> >>>> Therefore, the above deadlock sequence now becomes:
> >>>>
> >>>> 1. Free (3/X), placing (3/X) on the transaction's busy list.
> >>>>
> >>>> 2. Start trying to free (3/Y), notice that AG 3 has elevated
> >>>> pag_nr_pending_extents, return EAGAIN
> >>>>
> >>>> 3. Roll transaction, which moves (3/X) to the CIL and gets us a fresh
> >>>> transaction
> >>>>
> >>>> 4. Try to free (3/Y) again
> >>>>
> >>>> At step 4, if pag_nr_pending_extents is still elevated, does that
> >>>> mean we return EAGAIN and keep rolling the transaction until it's not
> >>>> elevated? Shouldn't we be able to proceed at step 4, even if we end up
> >>>> waiting on the log to clear (3/X) from the busy list?
> >>>
> >>> The pag_nr_pending_extents is expected to be decremented in step 3.
> >>
> >> OH. I missed (and you pointed out below) the subtlety that step 3 gives
> >> us a fresh transaction /and/ a fresh EFI, so after the roll, (3/Y) is
> >> the "first" EFI and never has to wait.
> >
> > Yes exactly.
> >
> >>
> >>> The changes in xlog_cil_insert_items() are supposed to do so. So at
> >>> step 4, for this case {3/X, 3/Y}, pag_nr_pending_extents should have been
> >>> decremented to zero, thus step 4 can go (after the busy extent 3/X is
> >>> cleared).
> >>
> >> So even if the log is being slow and the CIL hasn't cleared the busy
> >> extent (3/X) by the time we finish rolling the tx and relogging the EFI,
> >> the relogged EFI will start with (3/Y) and move forward without waiting.
> >> Got it.
> >
> > Correct. There is no waiting while we start to process the EFI records.
> > The point here is that we process the records either in the existing
> > transaction or in new transactions (by rolling the current one).
I think the exact point where the filesystem stalls is the
xfs_extent_busy_flush call in xfs_alloc_ag_vextent_size, right?
Prior to 6.3, this was:
xfs_alloc_fix_freelist ->
XFS_ALLOCTYPE_THIS_AG ->
xfs_alloc_ag_vextent ->
xfs_alloc_ag_vextent_size ->
(run all the way to the end of the bnobt) ->
xfs_extent_busy_flush ->
<stall on the busy extent that's in @tp->busy_list>
Since 6.3 we got rid of the _THIS_AG indirection stuff and that becomes:
xfs_alloc_fix_freelist ->
xfs_alloc_ag_vextent_size ->
(run all the way to the end of the bnobt) ->
xfs_extent_busy_flush ->
<stall on the busy extent that's in @tp->busy_list>
xfs_extent_busy_flush does this, potentially while we're holding the
freed extent in @tp->t_busy_list:
error = xfs_log_force(mp, XFS_LOG_SYNC);
if (error)
return;
do {
prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
if (busy_gen != READ_ONCE(pag->pagb_gen))
break;
schedule();
} while (1);
finish_wait(&pag->pagb_wait, &wait);
The log force kicks the CIL to process whatever other committed items
might be lurking in the log. *Hopefully* someone else freed an extent
in the same AG, so the log force has now caused that *other* extent to
get processed so it has now cleared the busy list. Clearing something
from the busy list increments the busy generation (aka pagb_gen).
Unfortunately, there aren't any other extents, so the busy_gen does not
change, and the loop runs forever.
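To see the hang condition in isolation, here is a tiny userspace C model
(editorial sketch with hypothetical fields, not the kernel structs): pagb_gen
only advances when a busy extent is cleared, and an extent still held in the
waiter's own uncommitted transaction can never be cleared by a log force.

#include <stdbool.h>
#include <stdio.h>

struct ag_model {
	unsigned int pagb_gen;	/* bumped whenever a busy extent clears */
	int nr_busy;		/* busy extents in this AG */
	int nr_busy_ours;	/* of those, still held by our transaction */
};

/* Can a sync log force ever advance pagb_gen and wake the waiter? */
static bool flush_can_make_progress(const struct ag_model *ag)
{
	/* only extents committed by others can be resolved and cleared */
	return ag->nr_busy > ag->nr_busy_ours;
}

int main(void)
{
	/* the only busy extent is the one our transaction just freed */
	struct ag_model ag = { .pagb_gen = 1, .nr_busy = 1, .nr_busy_ours = 1 };

	printf("progress possible: %s\n",
	       flush_can_make_progress(&ag) ? "yes" : "no, the loop spins forever");
	return 0;
}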
At this point, Dave writes:
[15:57] <dchinner> so if we enter that function with busy extents on the
transaction, and we are doing an extent free operation, we should return
after the sync log force and not do the generation number wait
[15:58] <dchinner> if we fail to allocate again after the sync log force
and the generation number hasn't changed, then return -EAGAIN because no
progress has been made.
[15:59] <dchinner> Then the transaction is rolled, the transaction busy
list is cleared, and if the next allocation attempt fails because
everything is busy, we go to sleep waiting for the generation to change
[16:00] <dchinner> but because the transaction does not hold any busy
extents, it cannot deadlock here because it does not pin any extents
that are in the busy tree....
[16:05] <dchinner> All the generation number is doing here is telling us
whether there was busy extent resolution between the time we last
skipped a viable extent because it was busy and when the flush
completes.
[16:06] <dchinner> it doesn't mean the next allocation will succeed,
just that progress has been made so trying the allocation attempt will
at least get a different result to the previous scan.
I think the callsites go from this:
if (busy) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
trace_xfs_alloc_size_busy(args);
xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
goto restart;
}
to something like this:
bool try_log_flush = true;
...
restart:
...
if (busy) {
bool progress;
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
trace_xfs_alloc_size_busy(args);
/*
* If the current transaction has an extent on the busy
* list, we're allocating space as part of freeing
* space, and all the free space is busy, we can't hang
* here forever. Force the log to try to unbusy free
* space that could have been freed by other
* transactions, and retry the allocation. If the
* allocation fails a second time because all the free
* space is busy and nobody made any progress with
* clearing busy extents, return EAGAIN so the caller
* can roll this transaction.
*/
if ((flags & XFS_ALLOC_FLAG_FREEING) &&
!list_empty(&tp->t_busy_list)) {
int log_flushed;
if (try_log_flush) {
_xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
try_log_flush = false;
goto restart;
}
if (busy_gen == READ_ONCE(pag->pagb_gen))
return -EAGAIN;
/* XXX should we set try_log_flush = true? */
goto restart;
}
xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
goto restart;
}
IOWs, I think Dave wants us to keep the changes in the allocator instead
of spreading it around.
--D
> >
> >>
> >>>>
> >>>> What happens if the log actually clears (3/X) from the busy list after
> >>>> step 3 but then some other thread starts processing an EFI for (3/Z)?
> >>>
> >>> There is a rule in the patch that the first record in an EFI can always go (bumping
> >>> the per-AG counter). That’s because freeing the first record never has
> >>> the deadlock issue.
> >>>
> >>> The 3/Y would go because now 3/Y is the first record of the new EFI,
> >>> which contains only the 3/Y record.
> >>
> >> Oh, ok. I missed that.
> >>
> >>> For the other thread freeing 3/Z, 3/Z can go too because it’s the first record in
> >>> that EFI (bumping the per-AG counter; now the counter could be 3 at most).
> >>
> >> <nod>
> >>
> >>>
> >>>> Does that cause this thread to roll repeatedly waiting for another
> >>>> thread's EFI to clear?
> >>>
> >>> Nope. As I said, the first record can always go. The 2nd and subsequent
> >>> records, on retry, become the first record in the new EFI, and so they go.
> >>
> >> <nod>
> >>
> >>>>
> >>>> IOWs, I'm not sure we can always make forward progress with this scheme,
> >>>> and I'd prefer to avoid introducing more global state.
> >>>
> >>> See above explanation.
> >>>
> >>>>
> >>>>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> >>>>> ---
> >>>>> fs/xfs/libxfs/xfs_ag.c | 1 +
> >>>>> fs/xfs/libxfs/xfs_ag.h | 5 ++
> >>>>> fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
> >>>>> fs/xfs/xfs_log_cil.c | 24 ++++++++-
> >>>>> 4 files changed, 138 insertions(+), 3 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> >>>>> index 86696a1c6891..61ef61e05668 100644
> >>>>> --- a/fs/xfs/libxfs/xfs_ag.c
> >>>>> +++ b/fs/xfs/libxfs/xfs_ag.c
> >>>>> @@ -378,6 +378,7 @@ xfs_initialize_perag(
> >>>>> pag->pagb_tree = RB_ROOT;
> >>>>> #endif /* __KERNEL__ */
> >>>>>
> >>>>> + atomic_set(&pag->pag_nr_pending_extents, 0);
> >>>>> error = xfs_buf_hash_init(pag);
> >>>>> if (error)
> >>>>> goto out_remove_pag;
> >>>>> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> >>>>> index 5e18536dfdce..5950bc36a32c 100644
> >>>>> --- a/fs/xfs/libxfs/xfs_ag.h
> >>>>> +++ b/fs/xfs/libxfs/xfs_ag.h
> >>>>> @@ -82,6 +82,11 @@ struct xfs_perag {
> >>>>> uint16_t pag_sick;
> >>>>> spinlock_t pag_state_lock;
> >>>>>
> >>>>> + /*
> >>>>> + * Number of concurrent extent freeings (not committed to CIL yet)
> >>>>> + * on this AG.
> >>>>> + */
> >>>>> + atomic_t pag_nr_pending_extents;
> >>>>> spinlock_t pagb_lock; /* lock for pagb_tree */
> >>>>> struct rb_root pagb_tree; /* ordered tree of busy extents */
> >>>>> unsigned int pagb_gen; /* generation count for pagb_tree */
> >>>>> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> >>>>> index 011b50469301..1dbf36d9c1c9 100644
> >>>>> --- a/fs/xfs/xfs_extfree_item.c
> >>>>> +++ b/fs/xfs/xfs_extfree_item.c
> >>>>> @@ -336,6 +336,75 @@ xfs_trans_get_efd(
> >>>>> return efdp;
> >>>>> }
> >>>>>
> >>>>> +/*
> >>>>> + * Fill the EFD with all extents from the EFI and set the counter.
> >>>>> + * Note: the EFD should contain at least one extent already.
> >>>>> + */
> >>>>> +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
> >>>>> +{
> >>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
> >>>>> + uint i;
> >>>>> +
> >>>>> + i = efdp->efd_next_extent;
> >>>>> + ASSERT(i > 0);
> >>>>> + for (; i < efip->efi_format.efi_nextents; i++) {
> >>>>> + efdp->efd_format.efd_extents[i] =
> >>>>> + efip->efi_format.efi_extents[i];
> >>>>
> >>>> Why is it necessary to fill the EFD mapping structures? Nobody /ever/
> >>>> looks at those; the only part that matters to log recovery is matching
> >>>> efd.efd_efi_id to efi.efi_id.
> >>>
> >>> Yes, that’s for EFI/EFD matching for log recovery.
> >>> The records to be retried would be in a new EFI.
> >>>
> >>>>
> >>>> But, I guess it looks funny to requeue an EFI and have the EFD for the
> >>>> old EFI missing a bunch of fields.
> >>>
> >>> The EFD to the original EFI is not missing anything; it is expected
> >>> to be exactly the same as if all the records were processed.
> >>> This way we move records to new EFI(s) to avoid the deadlock.
> >>>
> >>>>
> >>>>> + }
> >>>>> + efdp->efd_next_extent = i;
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Check if xefi is the first in the efip.
> >>>>> + * Returns true if so, and false otherwise.
> >>>>> + */
> >>>>> +static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
> >>>>> + struct xfs_extent_free_item *xefi)
> >>>>> +{
> >>>>> + return efip->efi_format.efi_extents[0].ext_start ==
> >>>>> + xefi->xefi_startblock;
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Check if the xefi needs to be in a new transaction.
> >>>>> + * efip is the containing EFI of xefi.
> >>>>> + * Return true if so, false otherwise.
> >>>>> + */
> >>>>> +static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
> >>>>> + struct xfs_efi_log_item *efip,
> >>>>> + struct xfs_extent_free_item *xefi)
> >>>>> +{
> >>>>> + bool ret = true;
> >>>>> + int nr_pre;
> >>>>> + xfs_agnumber_t agno;
> >>>>> + struct xfs_perag *pag;
> >>>>> +
> >>>>> + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
> >>>>> + pag = xfs_perag_get(mp, agno);
> >>>>> + /* The first extent in EFI is always OK to go */
> >>>>> + if (xfs_is_first_extent_in_efi(efip, xefi)) {
> >>>>> + atomic_inc(&pag->pag_nr_pending_extents);
> >>>>> + ret = false;
> >>>>> + goto out_put;
> >>>>> + }
> >>>>> +
> >>>>> + /*
> >>>>> + * Now the extent is the 2nd or subsequent in the efip. We need
> >>>>> + * new transaction if the AG already has busy extents pending.
> >>>>> + */
> >>>>> + nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
> >>>>> + /* No previous pending extent freeing to this AG */
> >>>>> + if (nr_pre == 0) {
> >>>>> + ret = false;
> >>>>> + goto out_put;
> >>>>> + }
> >>>>> +
> >>>>> + atomic_dec(&pag->pag_nr_pending_extents);
> >>>>> +out_put:
> >>>>> + xfs_perag_put(pag);
> >>>>> + return ret;
> >>>>> +}
> >>>>> +
> >>>>> /*
> >>>>> * Free an extent and log it to the EFD. Note that the transaction is marked
> >>>>> * dirty regardless of whether the extent free succeeds or fails to support the
> >>>>> @@ -356,6 +425,28 @@ xfs_trans_free_extent(
> >>>>> xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
> >>>>> xefi->xefi_startblock);
> >>>>> int error;
> >>>>> + struct xfs_efi_log_item *efip = efdp->efd_efip;
> >>>>> +
> >>>>> + if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
> >>>>> + /*
> >>>>> + * This should be the 2nd+ extent, we don't have to mark the
> >>>>> + * transaction and efd dirty, those are already done with the
> >>>>> + * first extent.
> >>>>> + */
> >>>>> + ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
> >>>>> + ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
> >>>>> + ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
> >>>>> +
> >>>>> + xfs_fill_efd_with_efi(efdp);
> >>>>> +
> >>>>> + /*
> >>>>> + * A previous extent in the same AG is processed in the current
> >>>>> + * transaction. To avoid a possible AGFL allocation deadlock,
> >>>>> + * we roll the transaction and then restart with this extent
> >>>>> + * in a new transaction.
> >>>>> + */
> >>>>> + return -EAGAIN;
> >>>>> + }
> >>>>>
> >>>>> oinfo.oi_owner = xefi->xefi_owner;
> >>>>> if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
> >>>>> @@ -369,6 +460,13 @@ xfs_trans_free_extent(
> >>>>> error = __xfs_free_extent(tp, xefi->xefi_startblock,
> >>>>> xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
> >>>>> xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
> >>>>> + if (error) {
> >>>>> + struct xfs_perag *pag;
> >>>>> +
> >>>>> + pag = xfs_perag_get(mp, agno);
> >>>>> + atomic_dec(&pag->pag_nr_pending_extents);
> >>>>> + xfs_perag_put(pag);
> >>>>> + }
> >>>>> /*
> >>>>> * Mark the transaction dirty, even on error. This ensures the
> >>>>> * transaction is aborted, which:
> >>>>> @@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
> >>>>
> >>>> This function comes with an unused **state variable:
> >>>>
> >>>
> >>> I don’t see why we need to take different action according to
> >>> the ‘state’.
> >>>
> >>>> STATIC int
> >>>> xfs_extent_free_finish_item(
> >>>> struct xfs_trans *tp,
> >>>> struct xfs_log_item *done,
> >>>> struct list_head *item,
> >>>> struct xfs_btree_cur **state)
> >>>>
> >>>> What if, after each xfs_trans_free_extent call, we stuff *state with
> >>>> xefi_startblock's AG?
> >>>>
> >>>
> >>> See above.
> >>>
> >>>> Then, upon entry to xfs_extent_free_finish_item, we compare *state with
> >>>> the xefi item and return EAGAIN if we're processing an EFI from the same
> >>>> AG as the previous call to xfs_extent_free_finish_item? Something like
> >>>> this (totally untested):
> >>>>
> >>>> STATIC int
> >>>> xfs_extent_free_finish_item(
> >>>> struct xfs_trans *tp,
> >>>> struct xfs_log_item *done,
> >>>> struct list_head *item,
> >>>> struct xfs_btree_cur **state)
> >>>> {
> >>>> struct xfs_extent_free_item *xefi;
> >>>> int error;
> >>>>
> >>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
> >>>>
> >>>> if (*state) {
> >>>> struct xfs_perag *oldpag = *(struct xfs_perag **)state;
> >>>>
> >>>> /*
> >>>> * If we're freeing two extents from the same AG, we
> >>>> * must roll the transaction between the two extents to
> >>>> * avoid a livelock resulting from AGFL fixing waiting
> >>>> * on the extent that we just freed to become unbusy.
> >>>> */
> >>>> if (oldpag == xefi->xefi_pag) {
> >>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
> >>>> xfs_perag_put(oldpag);
> >>>> return -EAGAIN;
> >>>> }
> >>>
> >>> As I said, the first record is always let go; we don’t care about the previous AG.
> >>>
> >>>> xfs_perag_put(oldpag);
> >>>> }
> >>>>
> >>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> >>>> if (!error)
> >>>> *state = (void *)xfs_perag_hold(xefi->xefi_pag);
> >>>>
> >>>> xfs_extent_free_put_group(xefi);
> >>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
> >>>> return error;
> >>>> }
> >>>>
> >>>> Obviously, you now need a ->finish_cleanup function to drop the perag
> >>>> reference that may be stashed in @state. This (I think) avoids the
> >>>> livelock by ensuring that xfs_trans_free_extent always starts with a
> >>>> transaction that does not contain any busy extents from the same AG that
> >>>> it's acting on, right?
> >>>
> >>> Nope, we don’t need to pass perag in state. The rule still holds:
> >>>
> >>> The first record in an EFI (whether it’s the original EFI or a new one) just
> >>> goes (because it won’t hit the deadlock).
> >>>
> >>> For the 2nd or subsequent records, we need to see if there are any pending
> >>> busy extents in the same AG the record belongs to. “Pending” means
> >>> those are still in xfs_trans->t_busy (not committed to xlog_cil_pcp->busy_extents
> >>> yet). We know that by checking the newly introduced per-AG counter
> >>> pag_nr_pending_extents. If there are, we retry the record in a new EFI
> >>> in a new transaction after committing the original transaction. Within the new
> >>> EFI, the 2nd record from the original EFI becomes the 1st record, and it can just
> >>> go (because it won’t hit the deadlock).
> >>
> >> <nod> So in other words, the pag_nr_pending_extents counter buys us the
> >> performance advantage that we don't need to add the extra roll between
> >> (3/X) and (3/Y) if the log is running fast enough to clear the busy
> >> extent before we even start evaluating the relogged EFI (with 3/Y in
> >> it).
> >
> > I’d think there needs to be a transaction roll between (3/X) and (3/Y). If we don’t do that
> > we would have a “pending” busy extent in the current xfs_trans->t_busy while processing
> > (3/Y). And that busy extent has no chance to be cleared, because it’s not committed
> > to xlog_cil_pcp->busy_extents yet and CIL flushing won’t take care of busy extents
> > which are not in busy_extents. So with no transaction roll between (3/X) and (3/Y), we
> > still risk the deadlock when allocating blocks for the AGFL.
> >
> >>
> >> How often does it happen that we have to roll the transaction and relog
> >> the EFI?
> >
> > It could depend on the use case. This may happen when truncating files.
> >
> >
> >>
> >>> We are using the per-AG counter here to know if there are pending busy extents,
> >>> rather than comparing against the previous AGs involved in the same EFI. That
> >>> also avoids the potential ABBA deadlock when two threads free from the same
> >>> two AGs in opposite orders:
> >>> thread 1: {3/X, 4/Y}
> >>> thread 2: {4/Z, 3/T}
> >>>
> >>>>
> >>>> A simpler version of the above would make it so that we always roll
> >>>> between EFI records and don't have to manage perag references:
> >>>>
> >>>> #define XFS_EFI_STATE_ROLL ((struct xfs_btree_cur *)1)
> >>>> STATIC int
> >>>> xfs_extent_free_finish_item(
> >>>> struct xfs_trans *tp,
> >>>> struct xfs_log_item *done,
> >>>> struct list_head *item,
> >>>> struct xfs_btree_cur **state)
> >>>> {
> >>>> struct xfs_extent_free_item *xefi;
> >>>> int error;
> >>>>
> >>>> /*
> >>>> * If we're freeing two extents, roll the transaction between
> >>>> * the two extents to avoid a livelock resulting from AGFL
> >>>> * fixing waiting on the extent that we just freed to become
> >>>> * unbusy. It's not necessary to roll if the previous and
> >>>> * current EFI record point to different AGs, but this
> >>>> * simplifies the state tracking.
> >>>> */
> >>>> if (*state == XFS_EFI_STATE_ROLL) {
> >>>> xfs_fill_efd_with_efi(EFD_ITEM(done));
> >>>> *state = NULL;
> >>>> return -EAGAIN;
> >>>> }
> >>>>
> >>>> xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
> >>>>
> >>>> error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
> >>>> if (!error)
> >>>> *state = XFS_EFI_STATE_ROLL;
> >>>>
> >>>> xfs_extent_free_put_group(xefi);
> >>>> kmem_cache_free(xfs_extfree_item_cache, xefi);
> >>>> return error;
> >>>> }
> >>>>
> >>>
> >>> That way, it looks to me like the original EFI is always split into several ones,
> >>> each including only one record. It’s the same result as changing
> >>> XFS_EFI_MAX_FAST_EXTENTS to 1. Dave doesn’t like that.
> >>
> >> Yes.
> >>
> >>> My implementation splits them only when necessary, as Dave suggested.
> >>>
> >>>> (Did you and Dave already talk about this?)
> >>>
> >>> There was much discussion in these two threads:
> >>> [PATCH 2/2] xfs: log recovery stage split EFIs with multiple extents
> >>> [PATCH 1/2] xfs: IO time one extent per EFI
> >>
> >> Ok, I'll go reread all that.
> >
> > thanks,
> > wengang
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 0:59 ` Darrick J. Wong
@ 2023-05-17 1:40 ` Dave Chinner
2023-05-17 1:54 ` Darrick J. Wong
2023-05-17 19:14 ` Wengang Wang
0 siblings, 2 replies; 16+ messages in thread
From: Dave Chinner @ 2023-05-17 1:40 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Wengang Wang, linux-xfs@vger.kernel.org
On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
> Since 6.3 we got rid of the _THIS_AG indirection stuff and that becomes:
>
> xfs_alloc_fix_freelist ->
> xfs_alloc_ag_vextent_size ->
> (run all the way to the end of the bnobt) ->
> xfs_extent_busy_flush ->
> <stall on the busy extent that's in @tp->busy_list>
>
> xfs_extent_busy_flush does this, potentially while we're holding the
> freed extent in @tp->t_busy_list:
>
> error = xfs_log_force(mp, XFS_LOG_SYNC);
> if (error)
> return;
>
> do {
> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> break;
> schedule();
> } while (1);
>
> finish_wait(&pag->pagb_wait, &wait);
>
> The log force kicks the CIL to process whatever other committed items
> might be lurking in the log. *Hopefully* someone else freed an extent
> in the same AG, so the log force has now caused that *other* extent to
> get processed so it has now cleared the busy list. Clearing something
> from the busy list increments the busy generation (aka pagb_gen).
*nod*
> Unfortunately, there aren't any other extents, so the busy_gen does not
> change, and the loop runs forever.
>
> At this point, Dave writes:
>
> [15:57] <dchinner> so if we enter that function with busy extents on the
> transaction, and we are doing an extent free operation, we should return
> after the sync log force and not do the generation number wait
>
> [15:58] <dchinner> if we fail to allocate again after the sync log force
> and the generation number hasn't changed, then return -EAGAIN because no
> progress has been made.
>
> [15:59] <dchinner> Then the transaction is rolled, the transaction busy
> list is cleared, and if the next allocation attempt fails because
> everything is busy, we go to sleep waiting for the generation to change
>
> [16:00] <dchinner> but because the transaction does not hold any busy
> extents, it cannot deadlock here because it does not pin any extents
> that are in the busy tree....
>
> [16:05] <dchinner> All the generation number is doing here is telling us
> whether there was busy extent resolution between the time we last
> skipped a viable extent because it was busy and when the flush
> completes.
>
> [16:06] <dchinner> it doesn't mean the next allocation will succeed,
> just that progress has been made so trying the allocation attempt will
> at least get a different result to the previous scan.
>
> I think the callsites go from this:
>
> if (busy) {
> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> trace_xfs_alloc_size_busy(args);
> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
> goto restart;
> }
I was thinking this code changes to:
flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
....
<attempt allocation>
....
if (busy) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
trace_xfs_alloc_size_busy(args);
error = xfs_extent_busy_flush(args->tp, args->pag,
busy_gen, flags);
if (!error) {
flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
goto restart;
}
/* jump to cleanup exit point */
goto out_error;
}
Note the different first parameter - we pass args->tp, not args->mp
so that the flush has access to the transaction context...
> to something like this:
>
> bool try_log_flush = true;
> ...
> restart:
> ...
>
> if (busy) {
> bool progress;
>
> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> trace_xfs_alloc_size_busy(args);
>
> /*
> * If the current transaction has an extent on the busy
> * list, we're allocating space as part of freeing
> * space, and all the free space is busy, we can't hang
> * here forever. Force the log to try to unbusy free
> * space that could have been freed by other
> * transactions, and retry the allocation. If the
> * allocation fails a second time because all the free
> * space is busy and nobody made any progress with
> * clearing busy extents, return EAGAIN so the caller
> * can roll this transaction.
> */
> if ((flags & XFS_ALLOC_FLAG_FREEING) &&
> !list_empty(&tp->t_busy_list)) {
> int log_flushed;
>
> if (try_log_flush) {
> _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
> try_log_flush = false;
> goto restart;
> }
>
> if (busy_gen == READ_ONCE(pag->pagb_gen))
> return -EAGAIN;
>
> /* XXX should we set try_log_flush = true? */
> goto restart;
> }
>
> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
> goto restart;
> }
>
> IOWs, I think Dave wants us to keep the changes in the allocator instead
> of spreading it around.
Sort of - I want the busy extent flush code to be isolated inside
xfs_extent_busy_flush(), not spread around the allocator. :)
xfs_extent_busy_flush(
struct xfs_trans *tp,
struct xfs_perag *pag,
unsigned int busy_gen,
unsigned int flags)
{
error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
if (error)
return error;
/*
* If we are holding busy extents, the caller may not want
* to block straight away. If we are being told just to try
* a flush or progress has been made since we last skipped a busy
* extent, return immediately to allow the caller to try
* again. If we are freeing extents, we might actually be
* holding the onyly free extents in the transaction busy
* list and the log force won't resolve that situation. In
* that case, return -EAGAIN to tell the caller
* it needs to commit the busy extents it holds before
* retrying the extent free operation.
*/
if (!list_empty(&tp->t_busy_list)) {
if (flags & XFS_ALLOC_FLAG_TRY_FLUSH)
return 0;
if (busy_gen != READ_ONCE(pag->pagb_gen))
return 0;
if (flags & XFS_ALLOC_FLAG_FREEING)
return -EAGAIN;
}
/* wait for progress resolving busy extents */
do {
prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
if (busy_gen != READ_ONCE(pag->pagb_gen))
break;
schedule();
} while (1);
finish_wait(&pag->pagb_wait, &wait);
return 0;
}
It seems cleaner to me to put this all in xfs_extent_busy_flush()
rather than having open-coded handling of extent free constraints in
each potential flush location. We already have retry semantics
around the flush, let's just extend them slightly....
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 1:40 ` Dave Chinner
@ 2023-05-17 1:54 ` Darrick J. Wong
2023-05-17 19:41 ` Wengang Wang
2023-05-17 19:14 ` Wengang Wang
1 sibling, 1 reply; 16+ messages in thread
From: Darrick J. Wong @ 2023-05-17 1:54 UTC (permalink / raw)
To: Dave Chinner; +Cc: Wengang Wang, linux-xfs@vger.kernel.org
On Wed, May 17, 2023 at 11:40:05AM +1000, Dave Chinner wrote:
> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
> > Since 6.3 we got rid of the _THIS_AG indirection stuff and that becomes:
> >
> > xfs_alloc_fix_freelist ->
> > xfs_alloc_ag_vextent_size ->
> > (run all the way to the end of the bnobt) ->
> > xfs_extent_busy_flush ->
> > <stall on the busy extent that's in @tp->busy_list>
> >
> > xfs_extent_busy_flush does this, potentially while we're holding the
> > freed extent in @tp->t_busy_list:
> >
> > error = xfs_log_force(mp, XFS_LOG_SYNC);
> > if (error)
> > return;
> >
> > do {
> > prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> > if (busy_gen != READ_ONCE(pag->pagb_gen))
> > break;
> > schedule();
> > } while (1);
> >
> > finish_wait(&pag->pagb_wait, &wait);
> >
> > The log force kicks the CIL to process whatever other committed items
> > might be lurking in the log. *Hopefully* someone else freed an extent
> > in the same AG, so the log force has now caused that *other* extent to
> > get processed so it has now cleared the busy list. Clearing something
> > from the busy list increments the busy generation (aka pagb_gen).
>
> *nod*
>
> > Unfortunately, there aren't any other extents, so the busy_gen does not
> > change, and the loop runs forever.
> >
> > At this point, Dave writes:
> >
> > [15:57] <dchinner> so if we enter that function with busy extents on the
> > transaction, and we are doing an extent free operation, we should return
> > after the sync log force and not do the generation number wait
> >
> > [15:58] <dchinner> if we fail to allocate again after the sync log force
> > and the generation number hasn't changed, then return -EAGAIN because no
> > progress has been made.
> >
> > [15:59] <dchinner> Then the transaction is rolled, the transaction busy
> > list is cleared, and if the next allocation attempt fails because
> > everything is busy, we go to sleep waiting for the generation to change
> >
> > [16:00] <dchinner> but because the transaction does not hold any busy
> > extents, it cannot deadlock here because it does not pin any extents
> > that are in the busy tree....
> >
> > [16:05] <dchinner> All the generation number is doing here is telling us
> > whether there was busy extent resolution between the time we last
> > skipped a viable extent because it was busy and when the flush
> > completes.
> >
> > [16:06] <dchinner> it doesn't mean the next allocation will succeed,
> > just that progress has been made so trying the allocation attempt will
> > at least get a different result to the previous scan.
> >
> > I think the callsites go from this:
> >
> > if (busy) {
> > xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> > trace_xfs_alloc_size_busy(args);
> > xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
> > goto restart;
> > }
>
> I was thinking this code changes to:
>
> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
> ....
> <attempt allocation>
> ....
> if (busy) {
> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> trace_xfs_alloc_size_busy(args);
> error = xfs_extent_busy_flush(args->tp, args->pag,
> busy_gen, flags);
> if (!error) {
> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
> goto restart;
> }
> /* jump to cleanup exit point */
> goto out_error;
> }
>
> Note the different first parameter - we pass args->tp, not args->mp
> so that the flush has access to the transaction context...
<nod>
> > to something like this:
> >
> > bool try_log_flush = true;
> > ...
> > restart:
> > ...
> >
> > if (busy) {
> > bool progress;
> >
> > xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> > trace_xfs_alloc_size_busy(args);
> >
> > /*
> > * If the current transaction has an extent on the busy
> > * list, we're allocating space as part of freeing
> > * space, and all the free space is busy, we can't hang
> > * here forever. Force the log to try to unbusy free
> > * space that could have been freed by other
> > * transactions, and retry the allocation. If the
> > * allocation fails a second time because all the free
> > * space is busy and nobody made any progress with
> > * clearing busy extents, return EAGAIN so the caller
> > * can roll this transaction.
> > */
> > if ((flags & XFS_ALLOC_FLAG_FREEING) &&
> > !list_empty(&tp->t_busy_list)) {
> > int log_flushed;
> >
> > if (try_log_flush) {
> > _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
> > try_log_flush = false;
> > goto restart;
> > }
> >
> > if (busy_gen == READ_ONCE(pag->pagb_gen))
> > return -EAGAIN;
> >
> > /* XXX should we set try_log_flush = true? */
> > goto restart;
> > }
> >
> > xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
> > goto restart;
> > }
> >
> > IOWs, I think Dave wants us to keep the changes in the allocator instead
> > of spreading it around.
>
> Sort of - I want the busy extent flush code to be isolated inside
> xfs_extent_busy_flush(), not spread around the allocator. :)
>
> xfs_extent_busy_flush(
> struct xfs_trans *tp,
> struct xfs_perag *pag,
> unsigned int busy_gen,
> unsigned int flags)
> {
> error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
> if (error)
> return error;
>
> /*
> * If we are holding busy extents, the caller may not want
> * to block straight away. If we are being told just to try
> * a flush or progress has been made since we last skipped a busy
> * extent, return immediately to allow the caller to try
> * again. If we are freeing extents, we might actually be
> * holding the onyly free extents in the transaction busy
only
> * list and the log force won't resolve that situation. In
> * that case, return -EAGAIN to tell the caller
> * it needs to commit the busy extents it holds before
> * retrying the extent free operation.
> */
> if (!list_empty(&tp->t_busy_list)) {
> if (flags & XFS_ALLOC_FLAG_TRY_FLUSH)
> return 0;
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> return 0;
> if (flags & XFS_ALLOC_FLAG_FREEING)
> return -EAGAIN;
> }
Indeed, that's a lot cleaner.
>
> /* wait for progress resolving busy extents */
> do {
> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> break;
> schedule();
> } while (1);
>
> finish_wait(&pag->pagb_wait, &wait);
> return 0;
> }
>
> It seems cleaner to me to put this all in xfs_extent_busy_flush()
> rather than having open-coded handling of extent free constraints in
> each potential flush location. We already have retry semantics
> around the flush, let's just extend them slightly....
<nod> Wengang, how does this sound?
--D
> -Dave.
>
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 1:40 ` Dave Chinner
2023-05-17 1:54 ` Darrick J. Wong
@ 2023-05-17 19:14 ` Wengang Wang
2023-05-17 22:49 ` Dave Chinner
1 sibling, 1 reply; 16+ messages in thread
From: Wengang Wang @ 2023-05-17 19:14 UTC (permalink / raw)
To: Dave Chinner; +Cc: Darrick J. Wong, linux-xfs@vger.kernel.org
> On May 16, 2023, at 6:40 PM, Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
>> Since 6.3 we got rid of the _THIS_AG indirection stuff and that becomes:
>>
>> xfs_alloc_fix_freelist ->
>> xfs_alloc_ag_vextent_size ->
>> (run all the way to the end of the bnobt) ->
>> xfs_extent_busy_flush ->
>> <stall on the busy extent that's in @tp->busy_list>
>>
>> xfs_extent_busy_flush does this, potentially while we're holding the
>> freed extent in @tp->t_busy_list:
>>
>> error = xfs_log_force(mp, XFS_LOG_SYNC);
>> if (error)
>> return;
>>
>> do {
>> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>> break;
>> schedule();
>> } while (1);
>>
>> finish_wait(&pag->pagb_wait, &wait);
>>
>> The log force kicks the CIL to process whatever other committed items
>> might be lurking in the log. *Hopefully* someone else freed an extent
>> in the same AG, so the log force has now caused that *other* extent to
>> get processed so it has now cleared the busy list. Clearing something
>> from the busy list increments the busy generation (aka pagb_gen).
>
> *nod*
>
>> Unfortunately, there aren't any other extents, so the busy_gen does not
>> change, and the loop runs forever.
>>
>> At this point, Dave writes:
>>
>> [15:57] <dchinner> so if we enter that function with busy extents on the
>> transaction, and we are doing an extent free operation, we should return
>> after the sync log force and not do the generation number wait
>>
>> [15:58] <dchinner> if we fail to allocate again after the sync log force
>> and the generation number hasn't changed, then return -EAGAIN because no
>> progress has been made.
>>
>> [15:59] <dchinner> Then the transaction is rolled, the transaction busy
>> list is cleared, and if the next allocation attempt fails because
>> everything is busy, we go to sleep waiting for the generation to change
>>
>> [16:00] <dchinner> but because the transaction does not hold any busy
>> extents, it cannot deadlock here because it does not pin any extents
>> that are in the busy tree....
>>
>> [16:05] <dchinner> All the generation number is doing here is telling us
>> whether there was busy extent resolution between the time we last
>> skipped a viable extent because it was busy and when the flush
>> completes.
>>
>> [16:06] <dchinner> it doesn't mean the next allocation will succeed,
>> just that progress has been made so trying the allocation attempt will
>> at least get a different result to the previous scan.
>>
>> I think the callsites go from this:
>>
>> if (busy) {
>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>> trace_xfs_alloc_size_busy(args);
>> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
>> goto restart;
>> }
>
Basically I think the following would also work, by adding complication to the allocator
(I didn’t want to do that, so as to keep the allocator as simple as possible).
> I was thinking this code changes to:
>
> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
> ....
> <attempt allocation>
> ....
> if (busy) {
> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> trace_xfs_alloc_size_busy(args);
> error = xfs_extent_busy_flush(args->tp, args->pag,
> busy_gen, flags);
> if (!error) {
> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
What’s the benefit of using XFS_ALLOC_FLAG_TRY_FLUSH?
If no change happened to pagb_gen, we would get nothing from the retry
but wasted cycles. Or am I missing something?
> goto restart;
> }
> /* jump to cleanup exit point */
> goto out_error;
> }
>
> Note the different first parameter - we pass args->tp, not args->mp
> so that the flush has access to the transaction context...
>
>> to something like this:
>>
>> bool try_log_flush = true;
>> ...
>> restart:
>> ...
>>
>> if (busy) {
>> bool progress;
>>
>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>> trace_xfs_alloc_size_busy(args);
>>
>> /*
>> * If the current transaction has an extent on the busy
>> * list, we're allocating space as part of freeing
>> * space, and all the free space is busy, we can't hang
>> * here forever. Force the log to try to unbusy free
>> * space that could have been freed by other
>> * transactions, and retry the allocation. If the
>> * allocation fails a second time because all the free
>> * space is busy and nobody made any progress with
>> * clearing busy extents, return EAGAIN so the caller
>> * can roll this transaction.
>> */
>> if ((flags & XFS_ALLOC_FLAG_FREEING) &&
>> !list_empty(&tp->t_busy_list)) {
>> int log_flushed;
>>
>> if (try_log_flush) {
>> _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
>> try_log_flush = false;
>> goto restart;
>> }
>>
>> if (busy_gen == READ_ONCE(pag->pagb_gen))
>> return -EAGAIN;
>>
>> /* XXX should we set try_log_flush = true? */
>> goto restart;
>> }
>>
>> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
>> goto restart;
>> }
>>
>> IOWs, I think Dave wants us to keep the changes in the allocator instead
>> of spreading it around.
>
> Sort of - I want the busy extent flush code to be isolated inside
> xfs_extent_busy_flush(), not spread around the allocator. :)
>
> xfs_extent_busy_flush(
> struct xfs_trans *tp,
> struct xfs_perag *pag,
> unsigned int busy_gen,
> unsigned int flags)
> {
> error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
> if (error)
> return error;
>
> /*
> * If we are holding busy extents, the caller may not want
> * to block straight away. If we are being told just to try
> * a flush or progress has been made since we last skipped a busy
> * extent, return immediately to allow the caller to try
> * again. If we are freeing extents, we might actually be
> * holding the onyly free extents in the transaction busy
> * list and the log force won't resolve that situation. In
> * that case, return -EAGAIN to tell the caller
> * it needs to commit the busy extents it holds before
> * retrying the extent free operation.
> */
> if (!list_empty(&tp->t_busy_list)) {
> if (flags & XFS_ALLOC_FLAG_TRY_FLUSH)
> return 0;
I don’t think the above is needed; see my previous comment.
thanks,
wengang
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> return 0;
> if (flags & XFS_ALLOC_FLAG_FREEING)
> return -EAGAIN;
> }
>
> /* wait for progress resolving busy extents */
> do {
> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> break;
> schedule();
> } while (1);
>
> finish_wait(&pag->pagb_wait, &wait);
> return 0;
> }
>
> It seems cleaner to me to put this all in xfs_extent_busy_flush()
> rather than having open-coded handling of extent free constraints in
> each potential flush location. We already have retry semantics
> around the flush, let's just extend them slightly....
>
> -Dave.
>
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 1:54 ` Darrick J. Wong
@ 2023-05-17 19:41 ` Wengang Wang
0 siblings, 0 replies; 16+ messages in thread
From: Wengang Wang @ 2023-05-17 19:41 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Dave Chinner, linux-xfs@vger.kernel.org
> On May 16, 2023, at 6:54 PM, Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, May 17, 2023 at 11:40:05AM +1000, Dave Chinner wrote:
>> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
>>> Since 6.3 we got rid of the _THIS_AG indirection stuff and that becomes:
>>>
>>> xfs_alloc_fix_freelist ->
>>> xfs_alloc_ag_vextent_size ->
>>> (run all the way to the end of the bnobt) ->
>>> xfs_extent_busy_flush ->
>>> <stall on the busy extent that's in @tp->busy_list>
>>>
>>> xfs_extent_busy_flush does this, potentially while we're holding the
>>> freed extent in @tp->t_busy_list:
>>>
>>> error = xfs_log_force(mp, XFS_LOG_SYNC);
>>> if (error)
>>> return;
>>>
>>> do {
>>> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
>>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>>> break;
>>> schedule();
>>> } while (1);
>>>
>>> finish_wait(&pag->pagb_wait, &wait);
>>>
>>> The log force kicks the CIL to process whatever other committed items
>>> might be lurking in the log. *Hopefully* someone else freed an extent
>>> in the same AG, so the log force has now caused that *other* extent to
>>> get processed so it has now cleared the busy list. Clearing something
>>> from the busy list increments the busy generation (aka pagb_gen).
>>
>> *nod*
>>
>>> Unfortunately, there aren't any other extents, so the busy_gen does not
>>> change, and the loop runs forever.
>>>
>>> At this point, Dave writes:
>>>
>>> [15:57] <dchinner> so if we enter that function with busy extents on the
>>> transaction, and we are doing an extent free operation, we should return
>>> after the sync log force and not do the generation number wait
>>>
>>> [15:58] <dchinner> if we fail to allocate again after the sync log force
>>> and the generation number hasn't changed, then return -EAGAIN because no
>>> progress has been made.
>>>
>>> [15:59] <dchinner> Then the transaction is rolled, the transaction busy
>>> list is cleared, and if the next allocation attempt fails because
>>> everything is busy, we go to sleep waiting for the generation to change
>>>
>>> [16:00] <dchinner> but because the transaction does not hold any busy
>>> extents, it cannot deadlock here because it does not pin any extents
>>> that are in the busy tree....
>>>
>>> [16:05] <dchinner> All the generation number is doing here is telling us
>>> whether there was busy extent resolution between the time we last
>>> skipped a viable extent because it was busy and when the flush
>>> completes.
>>>
>>> [16:06] <dchinner> it doesn't mean the next allocation will succeed,
>>> just that progress has been made so trying the allocation attempt will
>>> at least get a different result to the previous scan.
>>>
>>> I think the callsites go from this:
>>>
>>> if (busy) {
>>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>>> trace_xfs_alloc_size_busy(args);
>>> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
>>> goto restart;
>>> }
>>
>> I was thinking this code changes to:
>>
>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>> ....
>> <attempt allocation>
>> ....
>> if (busy) {
>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>> trace_xfs_alloc_size_busy(args);
>> error = xfs_extent_busy_flush(args->tp, args->pag,
>> busy_gen, flags);
>> if (!error) {
>> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
>> goto restart;
>> }
>> /* jump to cleanup exit point */
>> goto out_error;
>> }
>>
>> Note the different first parameter - we pass args->tp, not args->mp
>> so that the flush has access to the transaction context...
>
> <nod>
>
>>> to something like this:
>>>
>>> bool try_log_flush = true;
>>> ...
>>> restart:
>>> ...
>>>
>>> if (busy) {
>>> bool progress;
>>>
>>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>>> trace_xfs_alloc_size_busy(args);
>>>
>>> /*
>>> * If the current transaction has an extent on the busy
>>> * list, we're allocating space as part of freeing
>>> * space, and all the free space is busy, we can't hang
>>> * here forever. Force the log to try to unbusy free
>>> * space that could have been freed by other
>>> * transactions, and retry the allocation. If the
>>> * allocation fails a second time because all the free
>>> * space is busy and nobody made any progress with
>>> * clearing busy extents, return EAGAIN so the caller
>>> * can roll this transaction.
>>> */
>>> if ((flags & XFS_ALLOC_FLAG_FREEING) &&
>>> !list_empty(&tp->t_busy_list)) {
>>> int log_flushed;
>>>
>>> if (try_log_flush) {
>>> _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
>>> try_log_flush = false;
>>> goto restart;
>>> }
>>>
>>> if (busy_gen == READ_ONCE(pag->pagb_gen))
>>> return -EAGAIN;
>>>
>>> /* XXX should we set try_log_flush = true? */
>>> goto restart;
>>> }
>>>
>>> xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
>>> goto restart;
>>> }
>>>
>>> IOWs, I think Dave wants us to keep the changes in the allocator instead
>>> of spreading it around.
>>
>> Sort of - I want the busy extent flush code to be isolated inside
>> xfs_extent_busy_flush(), not spread around the allocator. :)
>>
>> xfs_extent_busy_flush(
>> struct xfs_trans *tp,
>> struct xfs_perag *pag,
>> unsigned int busy_gen,
>> unsigned int flags)
>> {
>> error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
>> if (error)
>> return error;
>>
>> /*
>> * If we are holding busy extents, the caller may not want
>> * to block straight away. If we are being told just to try
>> * a flush or progress has been made since we last skipped a busy
>> * extent, return immediately to allow the caller to try
>> * again. If we are freeing extents, we might actually be
>> * holding the onyly free extents in the transaction busy
>
> only
>
>> * list and the log force won't resolve that situation. In
>> * that case, return -EAGAIN to tell the caller
>> * it needs to commit the busy extents it holds before
>> * retrying the extent free operation.
>> */
>> if (!list_empty(&tp->t_busy_list)) {
>> if (flags & XFS_ALLOC_FLAG_TRY_FLUSH)
>> return 0;
>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>> return 0;
>> if (flags & XFS_ALLOC_FLAG_FREEING)
>> return -EAGAIN;
>> }
>
> Indeed, that's a lot cleaner.
>
>>
>> /* wait for progress resolving busy extents */
>> do {
>> prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>> break;
>> schedule();
>> } while (1);
>>
>> finish_wait(&pag->pagb_wait, &wait);
>> return 0;
>> }
>>
>> It seems cleaner to me to put this all in xfs_extent_busy_flush()
>> rather than having open-coded handling of extent free constraints in
>> each potential flush location. We already have retry semantics
>> around the flush, let's just extend them slightly....
>
> <nod> Wengang, how does this sound?
Thanks Darrick, pls see my previous reply.
thanks,
wengang
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 19:14 ` Wengang Wang
@ 2023-05-17 22:49 ` Dave Chinner
2023-05-18 0:10 ` Wengang Wang
0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2023-05-17 22:49 UTC (permalink / raw)
To: Wengang Wang; +Cc: Darrick J. Wong, linux-xfs@vger.kernel.org
On Wed, May 17, 2023 at 07:14:32PM +0000, Wengang Wang wrote:
> > On May 16, 2023, at 6:40 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
> > I was thinking this code changes to:
> >
> > flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
> > ....
> > <attempt allocation>
> > ....
> > if (busy) {
> > xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> > trace_xfs_alloc_size_busy(args);
> > error = xfs_extent_busy_flush(args->tp, args->pag,
> > busy_gen, flags);
> > if (!error) {
> > flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
>
> > What’s the benefit of using XFS_ALLOC_FLAG_TRY_FLUSH?
> > If no change happened to pagb_gen, we would get nothing from the retry
> > but wasted cycles. Or am I missing something?
You missed something: the synchronous log force is always done.
The log force is what allows busy extents to be resolved - busy
extents have to be committed to stable storage before they can be
removed from the busy extent tree.
If online discards are not enabled, busy extents are resolved
directly in journal IO completion - the log force waits for this to
occur. In this case, pag->pagb_gen will have already incremented to
indicate progress has been made, and we should never wait in the
loop after the log force. The only time we do that is when the
current transaction holds busy extents itself, and hence if the
current tx holds busy extents we should not wait beyond the log
force....
If online discards are enabled, then they'll be scheduled by journal
IO completion. i.e. waiting on the log force guarantees pending
discards have been scheduled and they'll start completing soon after
the log force returns. When they complete they'll start incrementing
pag->pagb_gen. This is the case the pag->pagb_gen wait loop exists
for - it waits for a discard to complete and resolve the busy extent
in its IO completion routine. At which point the allocation attempt
can restart.
However, the same caveat about the current tx holding
busy extents still exists - we can't tell the difference between
"discards scheduled but not completed" and "no busy extents to
resolve" in the flush code. Hence regardless of the online discard
feature state, we should not be waiting on busy extent generation
changes if we hold busy extents in the transaction....
IOWs, the TRY_FLUSH code reflects the fact that for most users, the
log force resolves the busy extents, not the wait loop on
pag->pagb_gen changing. The wait loop only really kicks in when
online discard is active, and in that case we really do want to
retry allocation without waiting for (potentially very slow)
discards to complete first. We'll do that "wait for discards" the
second time we fail to find a non-busy extent....
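Putting that together, here is a sketch of the allocation-side flow under
discussion, filling in control flow around the fragment quoted above (the
restart label and error path are illustrative, not the final patch):

	flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
restart:
	/* <attempt allocation> */
	if (busy) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
		trace_xfs_alloc_size_busy(args);
		error = xfs_extent_busy_flush(args->tp, args->pag,
				busy_gen, flags);
		if (error)
			goto out;	/* -EAGAIN: commit busy extents first */
		/*
		 * First failure: the flush forced the log and returned
		 * immediately. Clear TRY_FLUSH so a second failure will
		 * wait on pagb_gen (or fail with -EAGAIN when FREEING)
		 * rather than returning straight away again.
		 */
		flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
		goto restart;
	}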
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-17 22:49 ` Dave Chinner
@ 2023-05-18 0:10 ` Wengang Wang
2023-05-18 1:01 ` Dave Chinner
0 siblings, 1 reply; 16+ messages in thread
From: Wengang Wang @ 2023-05-18 0:10 UTC (permalink / raw)
To: Dave Chinner; +Cc: Darrick J. Wong, linux-xfs@vger.kernel.org
> On May 17, 2023, at 3:49 PM, Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, May 17, 2023 at 07:14:32PM +0000, Wengang Wang wrote:
>>> On May 16, 2023, at 6:40 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
>>> I was thinking this code changes to:
>>>
>>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>>> ....
>>> <attempt allocation>
>>> ....
>>> if (busy) {
>>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>>> trace_xfs_alloc_size_busy(args);
>>> error = xfs_extent_busy_flush(args->tp, args->pag,
>>> busy_gen, flags);
>>> if (!error) {
>>> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
>>
>> What’s the benefit of using XFS_ALLOC_FLAG_TRY_FLUSH?
>> If no change happened to pagb_gen, we would gain nothing in the retry
>> but waste cycles. Or did I miss something?
>
> You missed something: the synchronous log force is always done.
>
It’s true that a synchronous log force is done.
> The log force is what allows busy extents to be resolved - busy
> extents have to be committed to stable storage before they can be
> removed from the busy extent tree.
Yep.
>
> If online discards are not enabled, busy extents are resolved
> directly in journal IO completion - the log force waits for this to
> occur. In this case, pag->pagb_gen will have already incremented to
> indicate progress has been made, and we should never wait in the
> loop after the log force. The only time we do that is when the
> current transaction holds busy extents itself, and hence if the
> current tx holds busy extents we should not wait beyond the log
> force....
So you are talking about the case where “pagb_gen will have already incremented”.
In that case, your next two lines:
if (busy_gen != READ_ONCE(pag->pagb_gen))
return 0;
would capture that and return immediately without waiting. So TRY_FLUSH is not
helpful in this case.
>
> If online discards are enabled, then they'll be scheduled by journal
> IO completion. i.e. waiting on the log force guarantees pending
> discards have been scheduled and they'll start completing soon after
> the log force returns. When they complete they'll start incrementing
> pag->pagb_gen. This is the case the pag->pagb_gen wait loop exists
> for - it waits for a discard to complete and resolve the busy extent
> in its IO completion routine. At which point the allocation attempt
> can restart.
In the above case, you are saying that pag->pagb_gen will be
incremented soon without any blocking.
In this case, the allocator path still hits the same two lines:
if (busy_gen != READ_ONCE(pag->pagb_gen))
return 0;
The condition "busy_gen != READ_ONCE(pag->pagb_gen)" can be true
or false depending on how this process races with the queue workers
(which start completing the discards). If it's true, the code returns
immediately. Otherwise the code enters the loop and waits for the
condition to become true. When the queue workers increment
pag->pagb_gen, the allocator path exits the loop and proceeds, which
is exactly what we want. I don't see why TRY_FLUSH is needed.
>
> However, the same caveat about the current tx holding
> busy extents still exists - we can't tell the difference between
> "discards scheduled but not completed" and "no busy extents to
> resolve" in the flush code. Hence regardless of the online discard
> feature state, we should not be waiting on busy extent generation
> changes if we hold busy extents in the transaction....
Yes, that’s true.
But even without TRY_FLUSH, the code returns -EAGAIN immediately
via the following two lines:
if (flags & XFS_ALLOC_FLAG_FREEING)
return -EAGAIN;
without any waiting. So there is no problem if we don’t use TRY_FLUSH.
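For reference, the ordering of the early returns being debated, restating
the flush logic quoted upthread with annotations added here (the comments
are not from the patch):

	if (!list_empty(&tp->t_busy_list)) {
		if (flags & XFS_ALLOC_FLAG_TRY_FLUSH)
			return 0;	/* log was forced: let caller retry once */
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			return 0;	/* progress was made: retry */
		if (flags & XFS_ALLOC_FLAG_FREEING)
			return -EAGAIN;	/* no progress, cannot wait: roll the tx */
	}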
>
> IOWs, the TRY_FLUSH code reflects the fact that for most users, the
> log force resolves the busy extents, not the wait loop on
> pag->pagb_gen changing. The wait loop only really kicks in when
> online discard is active, and in that case we really do want to
> retry allocation without waiting for (potentially very slow)
> discards to complete first. We'll do that "wait for discards" the
> second time we fail to find a non-busy extent....
>
It seems that you want to give the search another pass over the free
extents in the AG, hoping we get lucky in the retry, without any
guarantee. It’s possible that in the retry the relevant busy extents
have been resolved (cleared), but it’s also possible that we don’t have
that luck.
Also, TRY_FLUSH doesn’t look like a key part of fixing the original
problem (the deadlock) but rather an optimization? And it may only help
when online discards are enabled (I don’t think that’s a popular mount
option; it hurts performance greatly).
If you want it anyway, I’d like to set it only when online discards are
enabled, otherwise it helps nothing at all and just wastes cycles.
So change the line of
flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
to
if (xfs_has_discard(mp))
flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
Sounds good?
thanks,
wengang
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-18 0:10 ` Wengang Wang
@ 2023-05-18 1:01 ` Dave Chinner
2023-05-18 1:25 ` Wengang Wang
0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2023-05-18 1:01 UTC (permalink / raw)
To: Wengang Wang; +Cc: Darrick J. Wong, linux-xfs@vger.kernel.org
On Thu, May 18, 2023 at 12:10:53AM +0000, Wengang Wang wrote:
>
>
> > On May 17, 2023, at 3:49 PM, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Wed, May 17, 2023 at 07:14:32PM +0000, Wengang Wang wrote:
> >>> On May 16, 2023, at 6:40 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
> >>> I was thinking this code changes to:
> >>>
> >>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
> >>> ....
> >>> <attempt allocation>
> >>> ....
> >>> if (busy) {
> >>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> >>> trace_xfs_alloc_size_busy(args);
> >>> error = xfs_extent_busy_flush(args->tp, args->pag,
> >>> busy_gen, flags);
> >>> if (!error) {
> >>> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
> >>
> >> What’s the benefit of using XFS_ALLOC_FLAG_TRY_FLUSH?
> >> If no change happened to pagb_gen, we would gain nothing in the retry
> >> but waste cycles. Or did I miss something?
> >
> > You missed something: the synchronous log force is always done.
> >
>
> It’s true that a synchronous log force is done.
>
> > The log force is what allows busy extents to be resolved - busy
> > extents have to be committed to stable storage before they can be
> > removed from the busy extent tree.
>
> Yep.
>
> >
> > If online discards are not enabled, busy extents are resolved
> > directly in journal IO completion - the log force waits for this to
> > occur. In this case, pag->pagb_gen will have already incremented to
> > indicate progress has been made, and we should never wait in the
> > loop after the log force. The only time we do that is when the
> > current transaction holds busy extents itself, and hence if the
> > current tx holds busy extents we should not wait beyond the log
> > force....
>
> So you are talking about the case where “pagb_gen will have already incremented”.
> In that case, your next two lines:
>
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> return 0;
>
> would capture that and return immediately without waiting. So TRY_FLUSH is not
> helpful in this case.
Except when pag->pagb_gen doesn't change. If it hasn't changed, then
we'd immediately return -EAGAIN without trying again. That is not
the behaviour we want; we want the allocation to always try
again at least once after a flush has been run, because ....
> > If online discards are enabled, then they'll be scheduled by journal
> > IO completion. i.e. waiting on the log force guarantees pending
> > discards have been scheduled and they'll start completing soon after
> > the log force returns. When they complete they'll start incrementing
> > pag->pagb_gen. This is the case the pag->pagb_gen wait loop exists
> > for - it waits for a discard to complete and resolve the busy extent
> > in its IO completion routine. At which point the allocation attempt
> > can restart.
>
> In the above case, you are saying that pag->pagb_gen will be
> incremented soon without any blocking.
> In this case, the allocator path still hits the same two lines:
> if (busy_gen != READ_ONCE(pag->pagb_gen))
> return 0;
>
> The condition "busy_gen != READ_ONCE(pag->pagb_gen)" can be true
> or false depending on how this process races with the queue workers
> (which start completing the discards). If it's true, the code returns
> immediately. Otherwise the code enters the loop and waits for the
> condition to become true. When the queue workers increment
> pag->pagb_gen, the allocator path exits the loop and proceeds, which
> is exactly what we want. I don't see why TRY_FLUSH is needed.
.... we sample pag->pagb_gen part way through the extent search and
we might sample it multiple times if we encounter multiple busy
extents. The way the cursor currently works is that it stores the
last busy gen returned. The pag->pagb_gen value might change between
the first busy extent we encounter and the last that we encounter in
the search. If this happens, it implies that some busy extents have
resolved while we have been searching.
In that case, we can enter the flush with busy_gen = pag->pagb_gen,
but that doesn't mean no busy extents have been resolved since we
started the allocation search. Hence the flush itself may not
resolve any new busy extents, but we still may have other busy
extents that were resolved while we were scanning the tree.
Yes, we could change the way we track the busy gen in the cursor,
but that likely has other subtle impacts. e.g. we now have to
consider the fact we may never get a wakeup because all busy extents
in flight have already been resolved before we go to sleep.
It is simpler to always retry the allocation after a minimal flush
to capture busy extents that were resolved while we were scanning
the tree. This is a slow path - we don't care if we burn extra CPU
on a retry that makes no progress because this condition is only
likely to occur when we are trying to do allocation or freeing
extents very near ENOSPC.
The try-flush gives us a mechanism that always does a minimum flush
first before a retry. If that retry fails, then we block, fail or
retry based on whether progress has been made. But we always want
that initial retry before we block, fail or retry...
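A sketch of the sampling pattern being described, with hypothetical
helper names and types elided (the real search is the btree walk in
xfs_alloc_ag_vextent_size()):

	/*
	 * busy_gen is re-sampled at every busy extent the search hits,
	 * so it ends up holding pag->pagb_gen as of the *last* busy
	 * hit, not as of the start of the search.
	 */
	unsigned	busy_gen = 0;
	bool		busy = false;

	while ((ext = next_candidate_extent(cur)) != NULL) {
		if (extent_is_busy(args->pag, ext, &busy_gen)) {
			busy = true;	/* busy_gen := current pagb_gen */
			continue;
		}
		return found(ext);
	}
	/*
	 * Busy extents resolved between the first and last busy hit
	 * bumped pagb_gen before the final sample, so busy_gen ==
	 * pag->pagb_gen at flush time does not prove that no progress
	 * was made during the search. Hence the unconditional first
	 * retry that TRY_FLUSH provides.
	 */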
> So change the line of
>
> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>
> to
>
> if (xfs_has_discard(mp))
> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>
> Sounds good?
No. The allocator itself should know nothing about when discards are
enabled - the whole point of the busy extent infrastructure handling
discards is to abstract discard delays away from the allocator
itself. The allocator only cares if the extent is busy or not, it
doesn't care why the extent is busy. And, as per above, the
TRY_FLUSH triggered retry is needed regardless of discards because
busy extents can resolve while a search is in progress instead of
being resolved by the log force...
-Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions
2023-05-18 1:01 ` Dave Chinner
@ 2023-05-18 1:25 ` Wengang Wang
0 siblings, 0 replies; 16+ messages in thread
From: Wengang Wang @ 2023-05-18 1:25 UTC (permalink / raw)
To: Dave Chinner; +Cc: Darrick J. Wong, linux-xfs@vger.kernel.org
> On May 17, 2023, at 6:01 PM, Dave Chinner <david@fromorbit.com> wrote:
>
> On Thu, May 18, 2023 at 12:10:53AM +0000, Wengang Wang wrote:
>>
>>
>>> On May 17, 2023, at 3:49 PM, Dave Chinner <david@fromorbit.com> wrote:
>>>
>>> On Wed, May 17, 2023 at 07:14:32PM +0000, Wengang Wang wrote:
>>>>> On May 16, 2023, at 6:40 PM, Dave Chinner <david@fromorbit.com> wrote:
>>>>> On Tue, May 16, 2023 at 05:59:13PM -0700, Darrick J. Wong wrote:
>>>>> I was thinking this code changes to:
>>>>>
>>>>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>>>>> ....
>>>>> <attempt allocation>
>>>>> ....
>>>>> if (busy) {
>>>>> xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
>>>>> trace_xfs_alloc_size_busy(args);
>>>>> error = xfs_extent_busy_flush(args->tp, args->pag,
>>>>> busy_gen, flags);
>>>>> if (!error) {
>>>>> flags &= ~XFS_ALLOC_FLAG_TRY_FLUSH;
>>>>
>>>> What’s the benefit of using XFS_ALLOC_FLAG_TRY_FLUSH?
>>>> If no change happened to pagb_gen, we would gain nothing in the retry
>>>> but waste cycles. Or did I miss something?
>>>
>>> You missed something: the synchronous log force is always done.
>>>
>>
>> It’s true that a synchronous log force is done.
>>
>>> The log force is what allows busy extents to be resolved - busy
>>> extents have to be committed to stable storage before they can be
>>> removed from the busy extent tree.
>>
>> Yep.
>>
>>>
>>> If online discards are not enabled, busy extents are resolved
>>> directly in journal IO completion - the log force waits for this to
>>> occur. In this case, pag->pagb_gen will have already incremented to
>>> indicate progress has been made, and we should never wait in the
>>> loop after the log force. The only time we do that is when the
>>> current transaction holds busy extents itself, and hence if the
>>> current tx holds busy extents we should not wait beyond the log
>>> force....
>>
>> So you are talking about the case where “pagb_gen will have already incremented”.
>> In that case, your next two lines:
>>
>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>> return 0;
>>
>> would capture that and return immediately without waiting. So TRY_FLUSH is not
>> helpful in this case.
>
> Except when pag->pagb_gen doesn't change. If it hasn't changed, then
> we'd immediately return -EAGAIN without trying again. That is not
> the behaviour we want; we want the allocation to always try
> again at least once after a flush has been run, because ....
>
>>> If online discards are enabled, then they'll be scheduled by journal
>>> IO completion. i.e. waiting on the log force guarantees pending
>>> discards have been scheduled and they'll start completing soon after
>>> the log force returns. When they complete they'll start incrementing
>>> pag->pagb_gen. This is the case the pag->pagb_gen wait loop exists
>>> for - it waits for a discard to complete and resolve the busy extent
>>> in its IO completion routine. At which point the allocation attempt
>>> can restart.
>>
>> In the above case, you are saying that pag->pagb_gen will be
>> incremented soon without any blocking.
>> In this case, the allocator path still hits the same two lines:
>> if (busy_gen != READ_ONCE(pag->pagb_gen))
>> return 0;
>>
>> The condition "busy_gen != READ_ONCE(pag->pagb_gen)" can be true
>> or false depending on how this process races with the queue workers
>> (which start completing the discards). If it's true, the code returns
>> immediately. Otherwise the code enters the loop and waits for the
>> condition to become true. When the queue workers increment
>> pag->pagb_gen, the allocator path exits the loop and proceeds, which
>> is exactly what we want. I don't see why TRY_FLUSH is needed.
>
> .... we sample pag->pagb_gen part way through the extent search and
> we might sample it multiple times if we encounter multiple busy
> extents. The way the cursor currently works is that it stores the
> last busy gen returned. The pag->pagb_gen value might change between
> the first busy extent we encounter and the last that we encounter in
> the search. If this happens, it implies that some busy extents have
> resolved while we have been searching.
>
> In that case, we can enter the flush with busy_gen = pag->pagb_gen,
> but that doesn't mean no busy extents have been resolved since we
> started the allocation search. Hence the flush itself may not
> resolve any new busy extents, but we still may have other busy
> extents that were resolved while we were scanning the tree.
>
Yep, there could be such situations, though I don’t know how frequently that happens.
> Yes, we could change the way we track the busy gen in the cursor,
> but that likely has other subtle impacts. e.g. we now have to
> consider the fact we may never get a wakeup because all busy extents
> in flight have already been resolved before we go to sleep.
Hm.. yep, this could be a future enhancement.
>
> It is simpler to always retry the allocation after a minimal flush
> to capture busy extents that were resolved while we were scanning
> the tree. This is a slow path - we don't care if we burn extra CPU
> on a retry that makes no progress because this condition is only
> likely to occur when we are trying to do allocation or freeing
> extents very near ENOSPC.
Agreed.
>
> The try-flush gives us a mechanism that always does a minimum flush
> first before a retry. If that retry fails, then we block, fail or
> retry based on whether progress has been made. But we always want
> that initial retry before we block, fail or retry...
>
OK.
>> So change the line of
>>
>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>>
>> to
>>
>> if (xfs_has_discard(mp))
>> flags |= XFS_ALLOC_FLAG_TRY_FLUSH;
>>
>> Sounds good?
>
> No. The allocator itself should know nothing about when discards are
> enabled - the whole point of the busy extent infrastructure handling
> discards is to abstract discard delays away from the allocator
> itself. The allocator only cares if the extent is busy or not, it
> doesn't care why the extent is busy. And, as per above, the
> TRY_FLUSH triggered retry is needed regardless of discards because
> busy extents can resolve while a search is in progress instead of
> being resolved by the log force...
OK, I will go with TRY_FLUSH.
thanks,
wengang