[PATCH] bcachefs: Fix lock thrashing in __bchfs

Linux bcachefs list
 help / color / mirror / Atom feed

* [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
@ 2023-08-03  7:50 Kent Overstreet
  2023-08-04 16:50 ` Brian Foster
  0 siblings, 1 reply; 3+ messages in thread
From: Kent Overstreet @ 2023-08-03  7:50 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

We've observed significant lock thrashing on fstests generic/083 in
fallocate, due to dropping and retaking btree locks when checking the
pagecache for data.

This adds a nonblocking mode to bch2_clamp_data_hole(), where we only
use folio_trylock(), and can thus be used safely while btree locks are
held - thus we only have to drop btree locks as a fallback, on actual
lock contention.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs-io.c | 80 +++++++++++++++++++++++++++++++--------------
 1 file changed, 55 insertions(+), 25 deletions(-)

diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index dcaf7aad79..d433f4d566 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,7 +35,7 @@
 
 #include <trace/events/writeback.h>
 
-static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned);
+static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
 
 struct folio_vec {
 	struct folio	*fv_folio;
@@ -3410,11 +3410,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		}
 
 		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
-			ret = drop_locks_do(&trans,
-				(bch2_clamp_data_hole(&inode->v,
-						      &hole_start,
-						      &hole_end,
-						      opts.data_replicas), 0));
+			if (bch2_clamp_data_hole(&inode->v,
+						 &hole_start,
+						 &hole_end,
+						 opts.data_replicas, true))
+				ret = drop_locks_do(&trans,
+					(bch2_clamp_data_hole(&inode->v,
+							      &hole_start,
+							      &hole_end,
+							      opts.data_replicas, false), 0));
 			bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
 
 			if (ret)
@@ -3714,7 +3718,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos,
 static loff_t bch2_seek_pagecache_data(struct inode *vinode,
 				       loff_t start_offset,
 				       loff_t end_offset,
-				       unsigned min_replicas)
+				       unsigned min_replicas,
+				       bool nonblock)
 {
 	struct folio_batch fbatch;
 	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
@@ -3731,7 +3736,13 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			folio_lock(folio);
+			if (!nonblock) {
+				folio_lock(folio);
+			} else if (!folio_trylock(folio)) {
+				folio_batch_release(&fbatch);
+				return -EAGAIN;
+			}
+
 			offset = folio_data_offset(folio,
 					max(folio_pos(folio), start_offset),
 					min_replicas);
@@ -3796,7 +3807,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 
 	if (next_data > offset)
 		next_data = bch2_seek_pagecache_data(&inode->v,
-						     offset, next_data, 0);
+					offset, next_data, 0, false);
 
 	if (next_data >= isize)
 		return -ENXIO;
@@ -3804,18 +3815,24 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
 }
 
-static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
-			      unsigned min_replicas)
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+			      unsigned min_replicas, bool nonblock)
 {
 	struct folio *folio;
 	struct bch_folio *s;
 	unsigned i, sectors;
 	bool ret = true;
 
-	folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
+	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+				    !nonblock ? FGP_LOCK : 0, 0);
 	if (IS_ERR_OR_NULL(folio))
 		return true;
 
+	if (nonblock && !folio_trylock(folio)) {
+		folio_put(folio);
+		return -EAGAIN;
+	}
+
 	s = bch2_folio(folio);
 	if (!s)
 		goto unlock;
@@ -3840,31 +3857,44 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
 static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
 				       loff_t start_offset,
 				       loff_t end_offset,
-				       unsigned min_replicas)
+				       unsigned min_replicas,
+				       bool nonblock)
 {
 	struct address_space *mapping = vinode->i_mapping;
 	loff_t offset = start_offset;
 
 	while (offset < end_offset &&
-	       !folio_hole_offset(mapping, &offset, min_replicas))
+	       !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
 		;
 
 	return min(offset, end_offset);
 }
 
-static void bch2_clamp_data_hole(struct inode *inode,
-				 u64 *hole_start,
-				 u64 *hole_end,
-				 unsigned min_replicas)
+static int bch2_clamp_data_hole(struct inode *inode,
+				u64 *hole_start,
+				u64 *hole_end,
+				unsigned min_replicas,
+				bool nonblock)
 {
-	*hole_start = bch2_seek_pagecache_hole(inode,
-		*hole_start << 9, *hole_end << 9, min_replicas) >> 9;
+	loff_t ret;
+
+	ret = bch2_seek_pagecache_hole(inode,
+		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+	if (ret < 0)
+		return ret;
+
+	*hole_start = ret;
 
 	if (*hole_start == *hole_end)
-		return;
+		return 0;
 
-	*hole_end = bch2_seek_pagecache_data(inode,
-		*hole_start << 9, *hole_end << 9, min_replicas) >> 9;
+	ret = bch2_seek_pagecache_data(inode,
+		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+	if (ret < 0)
+		return ret;
+
+	*hole_end = ret;
+	return 0;
 }
 
 static loff_t bch2_seek_hole(struct file *file, u64 offset)
@@ -3896,12 +3926,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 			   BTREE_ITER_SLOTS, k, ret) {
 		if (k.k->p.inode != inode->v.i_ino) {
 			next_hole = bch2_seek_pagecache_hole(&inode->v,
-					offset, MAX_LFS_FILESIZE, 0);
+					offset, MAX_LFS_FILESIZE, 0, false);
 			break;
 		} else if (!bkey_extent_is_data(k.k)) {
 			next_hole = bch2_seek_pagecache_hole(&inode->v,
 					max(offset, bkey_start_offset(k.k) << 9),
-					k.k->p.offset << 9, 0);
+					k.k->p.offset << 9, 0, false);
 
 			if (next_hole < k.k->p.offset << 9)
 				break;
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
  2023-08-03  7:50 [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate() Kent Overstreet
@ 2023-08-04 16:50 ` Brian Foster
  2023-08-05 16:04   ` Kent Overstreet
  0 siblings, 1 reply; 3+ messages in thread
From: Brian Foster @ 2023-08-04 16:50 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-bcachefs

On Thu, Aug 03, 2023 at 03:50:18AM -0400, Kent Overstreet wrote:
> We've observed significant lock thrashing on fstests generic/083 in
> fallocate, due to dropping and retaking btree locks when checking the
> pagecache for data.
> 
> This adds a nonblocking mode to bch2_clamp_data_hole(), where we only
> use folio_trylock(), and can thus be used safely while btree locks are
> held - thus we only have to drop btree locks as a fallback, on actual
> lock contention.
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  fs/bcachefs/fs-io.c | 80 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 55 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
> index dcaf7aad79..d433f4d566 100644
> --- a/fs/bcachefs/fs-io.c
> +++ b/fs/bcachefs/fs-io.c
> @@ -35,7 +35,7 @@
>  
>  #include <trace/events/writeback.h>
>  
> -static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned);
> +static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
>  
>  struct folio_vec {
>  	struct folio	*fv_folio;
> @@ -3410,11 +3410,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
>  		}
>  
>  		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> -			ret = drop_locks_do(&trans,
> -				(bch2_clamp_data_hole(&inode->v,
> -						      &hole_start,
> -						      &hole_end,
> -						      opts.data_replicas), 0));
> +			if (bch2_clamp_data_hole(&inode->v,
> +						 &hole_start,
> +						 &hole_end,
> +						 opts.data_replicas, true))
> +				ret = drop_locks_do(&trans,
> +					(bch2_clamp_data_hole(&inode->v,
> +							      &hole_start,
> +							      &hole_end,
> +							      opts.data_replicas, false), 0));

Nice. I recall that I had tried a quick experiment to just remove the
drop_locks_do() here and did end up producing a lockup, but I forget the
details. Aside from the general btree node locking principle you
mentioned in the previous thread, a brief comment on the hard lock
ordering rule here might be nice for future reference.

>  			bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
>  
>  			if (ret)
> @@ -3714,7 +3718,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos,
>  static loff_t bch2_seek_pagecache_data(struct inode *vinode,
>  				       loff_t start_offset,
>  				       loff_t end_offset,
> -				       unsigned min_replicas)
> +				       unsigned min_replicas,
> +				       bool nonblock)
>  {
>  	struct folio_batch fbatch;
>  	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
> @@ -3731,7 +3736,13 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
>  		for (i = 0; i < folio_batch_count(&fbatch); i++) {
>  			struct folio *folio = fbatch.folios[i];
>  
> -			folio_lock(folio);
> +			if (!nonblock) {
> +				folio_lock(folio);
> +			} else if (!folio_trylock(folio)) {
> +				folio_batch_release(&fbatch);
> +				return -EAGAIN;
> +			}
> +
>  			offset = folio_data_offset(folio,
>  					max(folio_pos(folio), start_offset),
>  					min_replicas);
> @@ -3796,7 +3807,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
>  
>  	if (next_data > offset)
>  		next_data = bch2_seek_pagecache_data(&inode->v,
> -						     offset, next_data, 0);
> +					offset, next_data, 0, false);
>  
>  	if (next_data >= isize)
>  		return -ENXIO;
> @@ -3804,18 +3815,24 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
>  	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
>  }
>  
> -static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
> -			      unsigned min_replicas)
> +static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
> +			      unsigned min_replicas, bool nonblock)
>  {
>  	struct folio *folio;
>  	struct bch_folio *s;
>  	unsigned i, sectors;
>  	bool ret = true;
>  
> -	folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
> +	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
> +				    !nonblock ? FGP_LOCK : 0, 0);
>  	if (IS_ERR_OR_NULL(folio))
>  		return true;

Looks like a bit of return value wonkiness here.. the function now
returns int, but we return true here, ret is a bool, etc.

Also JFYI it looks like filemap has an FGP_NOWAIT flag that when paired
with FGP_LOCK avoids the need for the extra code here.

Those nits aside this looks pretty good to me. Thanks for fixing it.
I'll have to give it a test (once I'm back most likely, I'm away for a
couple or so more days after today).

Brian

>  
> +	if (nonblock && !folio_trylock(folio)) {
> +		folio_put(folio);
> +		return -EAGAIN;
> +	}
> +
>  	s = bch2_folio(folio);
>  	if (!s)
>  		goto unlock;
> @@ -3840,31 +3857,44 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
>  static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
>  				       loff_t start_offset,
>  				       loff_t end_offset,
> -				       unsigned min_replicas)
> +				       unsigned min_replicas,
> +				       bool nonblock)
>  {
>  	struct address_space *mapping = vinode->i_mapping;
>  	loff_t offset = start_offset;
>  
>  	while (offset < end_offset &&
> -	       !folio_hole_offset(mapping, &offset, min_replicas))
> +	       !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
>  		;
>  
>  	return min(offset, end_offset);
>  }
>  
> -static void bch2_clamp_data_hole(struct inode *inode,
> -				 u64 *hole_start,
> -				 u64 *hole_end,
> -				 unsigned min_replicas)
> +static int bch2_clamp_data_hole(struct inode *inode,
> +				u64 *hole_start,
> +				u64 *hole_end,
> +				unsigned min_replicas,
> +				bool nonblock)
>  {
> -	*hole_start = bch2_seek_pagecache_hole(inode,
> -		*hole_start << 9, *hole_end << 9, min_replicas) >> 9;
> +	loff_t ret;
> +
> +	ret = bch2_seek_pagecache_hole(inode,
> +		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
> +	if (ret < 0)
> +		return ret;
> +
> +	*hole_start = ret;
>  
>  	if (*hole_start == *hole_end)
> -		return;
> +		return 0;
>  
> -	*hole_end = bch2_seek_pagecache_data(inode,
> -		*hole_start << 9, *hole_end << 9, min_replicas) >> 9;
> +	ret = bch2_seek_pagecache_data(inode,
> +		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
> +	if (ret < 0)
> +		return ret;
> +
> +	*hole_end = ret;
> +	return 0;
>  }
>  
>  static loff_t bch2_seek_hole(struct file *file, u64 offset)
> @@ -3896,12 +3926,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
>  			   BTREE_ITER_SLOTS, k, ret) {
>  		if (k.k->p.inode != inode->v.i_ino) {
>  			next_hole = bch2_seek_pagecache_hole(&inode->v,
> -					offset, MAX_LFS_FILESIZE, 0);
> +					offset, MAX_LFS_FILESIZE, 0, false);
>  			break;
>  		} else if (!bkey_extent_is_data(k.k)) {
>  			next_hole = bch2_seek_pagecache_hole(&inode->v,
>  					max(offset, bkey_start_offset(k.k) << 9),
> -					k.k->p.offset << 9, 0);
> +					k.k->p.offset << 9, 0, false);
>  
>  			if (next_hole < k.k->p.offset << 9)
>  				break;
> -- 
> 2.40.1
> 


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
  2023-08-04 16:50 ` Brian Foster
@ 2023-08-05 16:04   ` Kent Overstreet
  0 siblings, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-08-05 16:04 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-bcachefs

On Fri, Aug 04, 2023 at 12:50:53PM -0400, Brian Foster wrote:
> Nice. I recall that I had tried a quick experiment to just remove the
> drop_locks_do() here and did end up producing a lockup, but I forget the
> details. Aside from the general btree node locking principle you
> mentioned in the previous thread, a brief comment on the hard lock
> ordering rule here might be nice for future reference.

*nod* - added that

> > -	folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
> > +	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
> > +				    !nonblock ? FGP_LOCK : 0, 0);
> >  	if (IS_ERR_OR_NULL(folio))
> >  		return true;
> 
> Looks like a bit of return value wonkiness here.. the function now
> returns int, but we return true here, ret is a bool, etc.

It looks fine to me; it would definitely make more sense in Rust where
the return value would be Result<Bool, Errcode>, that's just how I'm
thinking about it here, even if it's a bit weird in C.

> Also JFYI it looks like filemap has an FGP_NOWAIT flag that when paired
> with FGP_LOCK avoids the need for the extra code here.

Thanks, added that.

> Those nits aside this looks pretty good to me. Thanks for fixing it.
> I'll have to give it a test (once I'm back most likely, I'm away for a
> couple or so more days after today).

It's been in the master branch for a bit (and the fixup for FGP_NOWAIT
is in the testing branch now), so I think it's pretty well tested at
this point.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-08-05 16:05 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-03  7:50 [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate() Kent Overstreet
2023-08-04 16:50 ` Brian Foster
2023-08-05 16:04   ` Kent Overstreet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox