[PATCH 0/2] btrfs: fix deadlock with fsync and full sync

linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/2] btrfs: fix deadlock with fsync and full sync
@ 2022-06-01 15:16 Josef Bacik
  2022-06-01 15:16 ` [PATCH 1/2] btrfs: make the return value for log syncing consistent Josef Bacik
  2022-06-01 15:16 ` [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync Josef Bacik
  0 siblings, 2 replies; 5+ messages in thread
From: Josef Bacik @ 2022-06-01 15:16 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Hello,

We've hit a pretty convoluted deadlock in production that Omar tracked down with
drgn.  I've described the deadlock in the second patch, but generally it's a
lock inversion where we have an existing dependency of extent lock ->
transaction, but in fsync in a few cases we can end up with transaction ->
extent lock, and the expected hilarity ensues.  Thanks,

Josef

Josef Bacik (2):
  btrfs: make the return value for log syncing consistent
  btrfs: fix deadlock with fsync+fiemap+full sync

 fs/btrfs/file.c     | 67 +++++++++++++++++++++++++++++++++++----------
 fs/btrfs/tree-log.c | 18 ++++++------
 fs/btrfs/tree-log.h |  3 ++
 3 files changed, 64 insertions(+), 24 deletions(-)

-- 
2.26.3

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] btrfs: make the return value for log syncing consistent
  2022-06-01 15:16 [PATCH 0/2] btrfs: fix deadlock with fsync and full sync Josef Bacik
@ 2022-06-01 15:16 ` Josef Bacik
  2022-06-01 16:25   ` Filipe Manana
  2022-06-01 15:16 ` [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync Josef Bacik
  1 sibling, 1 reply; 5+ messages in thread
From: Josef Bacik @ 2022-06-01 15:16 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Currently we will return 1 or -EAGAIN if we decide we need to commit
the transaction rather than sync the log.  In practice this doesn't
really matter, we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to
commit the transaction.  However this makes it hard to figure out what
the correct thing to do is.  Fix this up by defining
BTRFS_LOG_FORCE_COMMIT and using this in all the places where we want to
force the transaction to be committed.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/tree-log.c | 18 +++++++++---------
 fs/btrfs/tree-log.h |  3 +++
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1201f083d4db..d898ba13285f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -171,7 +171,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		int index = (root->log_transid + 1) % 2;
 
 		if (btrfs_need_log_full_commit(trans)) {
-			ret = -EAGAIN;
+			ret = BTRFS_LOG_FORCE_COMMIT;
 			goto out;
 		}
 
@@ -194,7 +194,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		 * writing.
 		 */
 		if (zoned && !created) {
-			ret = -EAGAIN;
+			ret = BTRFS_LOG_FORCE_COMMIT;
 			goto out;
 		}
 
@@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	/* bail out if we need to do a full commit */
 	if (btrfs_need_log_full_commit(trans)) {
-		ret = -EAGAIN;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		}
 		btrfs_wait_tree_log_extents(log, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = -EAGAIN;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		goto out;
 	}
 
@@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		blk_finish_plug(&plug);
 		btrfs_wait_tree_log_extents(log, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = -EAGAIN;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		goto out_wake_log_root;
 	}
 
@@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	    inode_only == LOG_INODE_ALL &&
 	    inode->last_unlink_trans >= trans->transid) {
 		btrfs_set_log_full_commit(trans);
-		ret = 1;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		goto out_unlock;
 	}
 
@@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	bool log_dentries = false;
 
 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
-		ret = 1;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		goto end_no_trans;
 	}
 
 	if (btrfs_root_refs(&root->root_item) == 0) {
-		ret = 1;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 		goto end_no_trans;
 	}
 
@@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
 	if (ret < 0) {
 		btrfs_set_log_full_commit(trans);
-		ret = 1;
+		ret = BTRFS_LOG_FORCE_COMMIT;
 	}
 
 	if (ret)
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1620f8170629..c3baa9d979a9 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -12,6 +12,9 @@
 /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
 #define BTRFS_NO_LOG_SYNC 256
 
+/* we can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT 1
+
 struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
-- 
2.26.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync
  2022-06-01 15:16 [PATCH 0/2] btrfs: fix deadlock with fsync and full sync Josef Bacik
  2022-06-01 15:16 ` [PATCH 1/2] btrfs: make the return value for log syncing consistent Josef Bacik
@ 2022-06-01 15:16 ` Josef Bacik
  2022-06-01 16:20   ` Filipe Manana
  1 sibling, 1 reply; 5+ messages in thread
From: Josef Bacik @ 2022-06-01 15:16 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

We are hitting the following deadlock in production occasionally

Task 1		Task 2		Task 3		Task 4		Task 5
		fsync(A)
		 start trans
						start commit
				falloc(A)
				 lock 5m-10m
				 start trans
				  wait for commit
fiemap(A)
 lock 0-10m
  wait for 5m-10m
   (have 0-5m locked)

		 have btrfs_need_log_full_commit
		  !full_sync
		  wait_ordered_extents
								finish_ordered_io(A)
								lock 0-5m
								DEADLOCK

We have an existing dependency of file extent lock -> transaction.
However in fsync if we tried to do the fast logging, but then had to
fall back to committing the transaction, we will be forced to call
btrfs_wait_ordered_range() to make sure all of our extents are updated.

This creates a dependency of transaction -> file extent lock, because
btrfs_finish_ordered_io() will need to take the file extent lock in
order to run the ordered extents.

Fix this by stopping the transaction if we have to do the full commit
and we attempted to do the fast logging.  Then attach to the transaction
and commit it if we need to.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/file.c | 67 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fd827b99c1b..9c799731cc70 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
 
-	if (ret != BTRFS_NO_LOG_SYNC) {
+	if (ret == BTRFS_NO_LOG_SYNC) {
+		ret = btrfs_end_transaction(trans);
+		goto out;
+	}
+
+	/* We successfully logged the inode, attempt to sync the log. */
+	if (!ret) {
+		ret = btrfs_sync_log(trans, root, &ctx);
 		if (!ret) {
-			ret = btrfs_sync_log(trans, root, &ctx);
-			if (!ret) {
-				ret = btrfs_end_transaction(trans);
-				goto out;
-			}
-		}
-		if (!full_sync) {
-			ret = btrfs_wait_ordered_range(inode, start, len);
-			if (ret) {
-				btrfs_end_transaction(trans);
-				goto out;
-			}
+			ret = btrfs_end_transaction(trans);
+			goto out;
 		}
-		ret = btrfs_commit_transaction(trans);
-	} else {
+	}
+
+	/*
+	 * At this point we need to commit the transaction because we had
+	 * btrfs_need_log_full_commit() or some other error.
+	 *
+	 * If we didn't do a full sync we have to stop the trans handle, wait on
+	 * the ordered extents, start it again and commit the transaction.  If
+	 * we attempt to wait on the ordered extents here we could deadlock with
+	 * something like fallocate() that is holding the extent lock trying to
+	 * start a transaction while some other thread is trying to commit the
+	 * transaction while we (fsync) are currently holding the transaction
+	 * open.
+	 */
+	if (!full_sync) {
 		ret = btrfs_end_transaction(trans);
+		if (ret)
+			goto out;
+		ret = btrfs_wait_ordered_range(inode, start, len);
+		if (ret)
+			goto out;
+
+		/*
+		 * This is safe to use here because we're only interested in
+		 * making sure the transaction that had the ordered extents is
+		 * committed.  We aren't waiting on anything past this point,
+		 * we're purely getting the transaction and committing it.
+		 */
+		trans = btrfs_attach_transaction_barrier(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+
+			/*
+			 * We committed the transaction and there's no currently
+			 * running transaction, this means everything we care
+			 * about made it to disk and we are done.
+			 */
+			if (ret == -ENOENT)
+				ret = 0;
+			goto out;
+		}
 	}
+
+	ret = btrfs_commit_transaction(trans);
 out:
 	ASSERT(list_empty(&ctx.list));
 	err = file_check_and_advance_wb_err(file);
-- 
2.26.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync
  2022-06-01 15:16 ` [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync Josef Bacik
@ 2022-06-01 16:20   ` Filipe Manana
  0 siblings, 0 replies; 5+ messages in thread
From: Filipe Manana @ 2022-06-01 16:20 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Wed, Jun 01, 2022 at 11:16:25AM -0400, Josef Bacik wrote:
> We are hitting the following deadlock in production occasionally
> 
> Task 1		Task 2		Task 3		Task 4		Task 5
> 		fsync(A)
> 		 start trans
> 						start commit
> 				falloc(A)
> 				 lock 5m-10m
> 				 start trans
> 				  wait for commit
> fiemap(A)
>  lock 0-10m
>   wait for 5m-10m
>    (have 0-5m locked)
> 
> 		 have btrfs_need_log_full_commit
> 		  !full_sync
> 		  wait_ordered_extents
> 								finish_ordered_io(A)
> 								lock 0-5m
> 								DEADLOCK
> 
> We have an existing dependency of file extent lock -> transaction.
> However in fsync if we tried to do the fast logging, but then had to
> fall back to committing the transaction, we will be forced to call
> btrfs_wait_ordered_range() to make sure all of our extents are updated.
> 
> This creates a dependency of transaction -> file extent lock, because
> btrfs_finish_ordered_io() will need to take the file extent lock in
> order to run the ordered extents.
> 
> Fix this by stopping the transaction if we have to do the full commit
> and we attempted to do the fast logging.  Then attach to the transaction
> and commit it if we need to.

The subject is confusing, it mentions deadlock with "full fsync". With that
you mean a transaction commit, and not the slow fsync mode (related to the
inode flag BTRFS_INODE_NEEDS_FULL_SYNC). And looking at the diagram and the
code we mention !full_sync, the fast fsync path, so that makes it even more
confusing.

Can we replace "full fsync" with transaction commit (subject)?

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/file.c | 67 ++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 52 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 1fd827b99c1b..9c799731cc70 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  	 */
>  	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
>  
> -	if (ret != BTRFS_NO_LOG_SYNC) {
> +	if (ret == BTRFS_NO_LOG_SYNC) {
> +		ret = btrfs_end_transaction(trans);
> +		goto out;
> +	}
> +
> +	/* We successfully logged the inode, attempt to sync the log. */
> +	if (!ret) {
> +		ret = btrfs_sync_log(trans, root, &ctx);
>  		if (!ret) {
> -			ret = btrfs_sync_log(trans, root, &ctx);
> -			if (!ret) {
> -				ret = btrfs_end_transaction(trans);
> -				goto out;
> -			}
> -		}
> -		if (!full_sync) {
> -			ret = btrfs_wait_ordered_range(inode, start, len);
> -			if (ret) {
> -				btrfs_end_transaction(trans);
> -				goto out;
> -			}
> +			ret = btrfs_end_transaction(trans);
> +			goto out;
>  		}
> -		ret = btrfs_commit_transaction(trans);
> -	} else {
> +	}
> +
> +	/*
> +	 * At this point we need to commit the transaction because we had
> +	 * btrfs_need_log_full_commit() or some other error.
> +	 *
> +	 * If we didn't do a full sync we have to stop the trans handle, wait on
> +	 * the ordered extents, start it again and commit the transaction.  If
> +	 * we attempt to wait on the ordered extents here we could deadlock with
> +	 * something like fallocate() that is holding the extent lock trying to
> +	 * start a transaction while some other thread is trying to commit the
> +	 * transaction while we (fsync) are currently holding the transaction
> +	 * open.
> +	 */
> +	if (!full_sync) {
>  		ret = btrfs_end_transaction(trans);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_wait_ordered_range(inode, start, len);

We could wait only on the ordered extents collected at ctx.ordered_extents,
like this it would avoid the need to flush and wait for any new writes that
may have started after we unlocked the inode.

There's no helper to just wait for a list of ordered extents, so that's
probably why you made it like that, to make it as short as possible.

But still, something to consider even if for later only.

Otherwise it looks good, thanks.

> +		if (ret)
> +			goto out;
> +
> +		/*
> +		 * This is safe to use here because we're only interested in
> +		 * making sure the transaction that had the ordered extents is
> +		 * committed.  We aren't waiting on anything past this point,
> +		 * we're purely getting the transaction and committing it.
> +		 */
> +		trans = btrfs_attach_transaction_barrier(root);
> +		if (IS_ERR(trans)) {
> +			ret = PTR_ERR(trans);
> +
> +			/*
> +			 * We committed the transaction and there's no currently
> +			 * running transaction, this means everything we care
> +			 * about made it to disk and we are done.
> +			 */
> +			if (ret == -ENOENT)
> +				ret = 0;
> +			goto out;
> +		}
>  	}
> +
> +	ret = btrfs_commit_transaction(trans);
>  out:
>  	ASSERT(list_empty(&ctx.list));
>  	err = file_check_and_advance_wb_err(file);
> -- 
> 2.26.3
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] btrfs: make the return value for log syncing consistent
  2022-06-01 15:16 ` [PATCH 1/2] btrfs: make the return value for log syncing consistent Josef Bacik
@ 2022-06-01 16:25   ` Filipe Manana
  0 siblings, 0 replies; 5+ messages in thread
From: Filipe Manana @ 2022-06-01 16:25 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Wed, Jun 01, 2022 at 11:16:24AM -0400, Josef Bacik wrote:
> Currently we will return 1 or -EAGAIN if we decide we need to commit
> the transaction rather than sync the log.  In practice this doesn't
> really matter, we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to
> commit the transaction.  However this makes it hard to figure out what
> the correct thing to do is.  Fix this up by defining
> BTRFS_LOG_FORCE_COMMIT and using this in all the places where we want to
> force the transaction to be committed.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/tree-log.c | 18 +++++++++---------
>  fs/btrfs/tree-log.h |  3 +++
>  2 files changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 1201f083d4db..d898ba13285f 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -171,7 +171,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  		int index = (root->log_transid + 1) % 2;
>  
>  		if (btrfs_need_log_full_commit(trans)) {
> -			ret = -EAGAIN;
> +			ret = BTRFS_LOG_FORCE_COMMIT;
>  			goto out;
>  		}
>  
> @@ -194,7 +194,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  		 * writing.
>  		 */
>  		if (zoned && !created) {
> -			ret = -EAGAIN;
> +			ret = BTRFS_LOG_FORCE_COMMIT;
>  			goto out;
>  		}
>  
> @@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>  
>  	/* bail out if we need to do a full commit */
>  	if (btrfs_need_log_full_commit(trans)) {
> -		ret = -EAGAIN;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		mutex_unlock(&root->log_mutex);
>  		goto out;
>  	}
> @@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>  		}
>  		btrfs_wait_tree_log_extents(log, mark);
>  		mutex_unlock(&log_root_tree->log_mutex);
> -		ret = -EAGAIN;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		goto out;
>  	}
>  
> @@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>  		blk_finish_plug(&plug);
>  		btrfs_wait_tree_log_extents(log, mark);
>  		mutex_unlock(&log_root_tree->log_mutex);
> -		ret = -EAGAIN;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		goto out_wake_log_root;
>  	}
>  
> @@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
>  	    inode_only == LOG_INODE_ALL &&
>  	    inode->last_unlink_trans >= trans->transid) {
>  		btrfs_set_log_full_commit(trans);
> -		ret = 1;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		goto out_unlock;
>  	}
>  
> @@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
>  	bool log_dentries = false;
>  
>  	if (btrfs_test_opt(fs_info, NOTREELOG)) {
> -		ret = 1;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		goto end_no_trans;
>  	}
>  
>  	if (btrfs_root_refs(&root->root_item) == 0) {
> -		ret = 1;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  		goto end_no_trans;
>  	}
>  
> @@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
>  end_trans:
>  	if (ret < 0) {
>  		btrfs_set_log_full_commit(trans);
> -		ret = 1;
> +		ret = BTRFS_LOG_FORCE_COMMIT;
>  	}
>  
>  	if (ret)
> diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
> index 1620f8170629..c3baa9d979a9 100644
> --- a/fs/btrfs/tree-log.h
> +++ b/fs/btrfs/tree-log.h
> @@ -12,6 +12,9 @@
>  /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
>  #define BTRFS_NO_LOG_SYNC 256
>  
> +/* we can't use the tree log for whatever reason, force a transaction commit */
> +#define BTRFS_LOG_FORCE_COMMIT 1

At btrfs_sync_file(), we shoudl also replace 1 with BTRFS_LOG_FORCE_COMMIT.
Otherwise it looks good, thanks.

> +
>  struct btrfs_log_ctx {
>  	int log_ret;
>  	int log_transid;
> -- 
> 2.26.3
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-06-01 16:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-06-01 15:16 [PATCH 0/2] btrfs: fix deadlock with fsync and full sync Josef Bacik
2022-06-01 15:16 ` [PATCH 1/2] btrfs: make the return value for log syncing consistent Josef Bacik
2022-06-01 16:25   ` Filipe Manana
2022-06-01 15:16 ` [PATCH 2/2] btrfs: fix deadlock with fsync+fiemap+full sync Josef Bacik
2022-06-01 16:20   ` Filipe Manana

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).