* [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
@ 2023-08-03 7:50 Kent Overstreet
2023-08-04 16:50 ` Brian Foster
0 siblings, 1 reply; 3+ messages in thread
From: Kent Overstreet @ 2023-08-03 7:50 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
We've observed significant lock thrashing on fstests generic/083 in
fallocate, due to dropping and retaking btree locks when checking the
pagecache for data.
This adds a nonblocking mode to bch2_clamp_data_hole(), where we only
use folio_trylock(), and can thus be used safely while btree locks are
held - thus we only have to drop btree locks as a fallback, on actual
lock contention.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/fs-io.c | 80 +++++++++++++++++++++++++++++++--------------
1 file changed, 55 insertions(+), 25 deletions(-)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index dcaf7aad79..d433f4d566 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,7 +35,7 @@
#include <trace/events/writeback.h>
-static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned);
+static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
struct folio_vec {
struct folio *fv_folio;
@@ -3410,11 +3410,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
}
if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- ret = drop_locks_do(&trans,
- (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas), 0));
+ if (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, true))
+ ret = drop_locks_do(&trans,
+ (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, false), 0));
bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
if (ret)
@@ -3714,7 +3718,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos,
static loff_t bch2_seek_pagecache_data(struct inode *vinode,
loff_t start_offset,
loff_t end_offset,
- unsigned min_replicas)
+ unsigned min_replicas,
+ bool nonblock)
{
struct folio_batch fbatch;
pgoff_t start_index = start_offset >> PAGE_SHIFT;
@@ -3731,7 +3736,13 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- folio_lock(folio);
+ if (!nonblock) {
+ folio_lock(folio);
+ } else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ return -EAGAIN;
+ }
+
offset = folio_data_offset(folio,
max(folio_pos(folio), start_offset),
min_replicas);
@@ -3796,7 +3807,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
if (next_data > offset)
next_data = bch2_seek_pagecache_data(&inode->v,
- offset, next_data, 0);
+ offset, next_data, 0, false);
if (next_data >= isize)
return -ENXIO;
@@ -3804,18 +3815,24 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}
-static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
- unsigned min_replicas)
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+ unsigned min_replicas, bool nonblock)
{
struct folio *folio;
struct bch_folio *s;
unsigned i, sectors;
bool ret = true;
- folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
+ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+ !nonblock ? FGP_LOCK : 0, 0);
if (IS_ERR_OR_NULL(folio))
return true;
+ if (nonblock && !folio_trylock(folio)) {
+ folio_put(folio);
+ return -EAGAIN;
+ }
+
s = bch2_folio(folio);
if (!s)
goto unlock;
@@ -3840,31 +3857,44 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
loff_t start_offset,
loff_t end_offset,
- unsigned min_replicas)
+ unsigned min_replicas,
+ bool nonblock)
{
struct address_space *mapping = vinode->i_mapping;
loff_t offset = start_offset;
while (offset < end_offset &&
- !folio_hole_offset(mapping, &offset, min_replicas))
+ !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
;
return min(offset, end_offset);
}
-static void bch2_clamp_data_hole(struct inode *inode,
- u64 *hole_start,
- u64 *hole_end,
- unsigned min_replicas)
+static int bch2_clamp_data_hole(struct inode *inode,
+ u64 *hole_start,
+ u64 *hole_end,
+ unsigned min_replicas,
+ bool nonblock)
{
- *hole_start = bch2_seek_pagecache_hole(inode,
- *hole_start << 9, *hole_end << 9, min_replicas) >> 9;
+ loff_t ret;
+
+ ret = bch2_seek_pagecache_hole(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_start = ret;
if (*hole_start == *hole_end)
- return;
+ return 0;
- *hole_end = bch2_seek_pagecache_data(inode,
- *hole_start << 9, *hole_end << 9, min_replicas) >> 9;
+ ret = bch2_seek_pagecache_data(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_end = ret;
+ return 0;
}
static loff_t bch2_seek_hole(struct file *file, u64 offset)
@@ -3896,12 +3926,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0);
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
} else if (!bkey_extent_is_data(k.k)) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0);
+ k.k->p.offset << 9, 0, false);
if (next_hole < k.k->p.offset << 9)
break;
--
2.40.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
2023-08-03 7:50 [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate() Kent Overstreet
@ 2023-08-04 16:50 ` Brian Foster
2023-08-05 16:04 ` Kent Overstreet
0 siblings, 1 reply; 3+ messages in thread
From: Brian Foster @ 2023-08-04 16:50 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcachefs
On Thu, Aug 03, 2023 at 03:50:18AM -0400, Kent Overstreet wrote:
> We've observed significant lock thrashing on fstests generic/083 in
> fallocate, due to dropping and retaking btree locks when checking the
> pagecache for data.
>
> This adds a nonblocking mode to bch2_clamp_data_hole(), where we only
> use folio_trylock(), and can thus be used safely while btree locks are
> held - thus we only have to drop btree locks as a fallback, on actual
> lock contention.
>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
> fs/bcachefs/fs-io.c | 80 +++++++++++++++++++++++++++++++--------------
> 1 file changed, 55 insertions(+), 25 deletions(-)
>
> diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
> index dcaf7aad79..d433f4d566 100644
> --- a/fs/bcachefs/fs-io.c
> +++ b/fs/bcachefs/fs-io.c
> @@ -35,7 +35,7 @@
>
> #include <trace/events/writeback.h>
>
> -static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned);
> +static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
>
> struct folio_vec {
> struct folio *fv_folio;
> @@ -3410,11 +3410,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
> }
>
> if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> - ret = drop_locks_do(&trans,
> - (bch2_clamp_data_hole(&inode->v,
> - &hole_start,
> - &hole_end,
> - opts.data_replicas), 0));
> + if (bch2_clamp_data_hole(&inode->v,
> + &hole_start,
> + &hole_end,
> + opts.data_replicas, true))
> + ret = drop_locks_do(&trans,
> + (bch2_clamp_data_hole(&inode->v,
> + &hole_start,
> + &hole_end,
> + opts.data_replicas, false), 0));
Nice. I recall that I had tried a quick experiment to just remove the
drop_locks_do() here and did end up producing a lockup, but I forget the
details. Aside from the general btree node locking principle you
mentioned in the previous thread, a brief comment on the hard lock
ordering rule here might be nice for future reference.
> bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
>
> if (ret)
> @@ -3714,7 +3718,8 @@ static int folio_data_offset(struct folio *folio, loff_t pos,
> static loff_t bch2_seek_pagecache_data(struct inode *vinode,
> loff_t start_offset,
> loff_t end_offset,
> - unsigned min_replicas)
> + unsigned min_replicas,
> + bool nonblock)
> {
> struct folio_batch fbatch;
> pgoff_t start_index = start_offset >> PAGE_SHIFT;
> @@ -3731,7 +3736,13 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
> for (i = 0; i < folio_batch_count(&fbatch); i++) {
> struct folio *folio = fbatch.folios[i];
>
> - folio_lock(folio);
> + if (!nonblock) {
> + folio_lock(folio);
> + } else if (!folio_trylock(folio)) {
> + folio_batch_release(&fbatch);
> + return -EAGAIN;
> + }
> +
> offset = folio_data_offset(folio,
> max(folio_pos(folio), start_offset),
> min_replicas);
> @@ -3796,7 +3807,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
>
> if (next_data > offset)
> next_data = bch2_seek_pagecache_data(&inode->v,
> - offset, next_data, 0);
> + offset, next_data, 0, false);
>
> if (next_data >= isize)
> return -ENXIO;
> @@ -3804,18 +3815,24 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
> return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
> }
>
> -static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
> - unsigned min_replicas)
> +static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
> + unsigned min_replicas, bool nonblock)
> {
> struct folio *folio;
> struct bch_folio *s;
> unsigned i, sectors;
> bool ret = true;
>
> - folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
> + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
> + !nonblock ? FGP_LOCK : 0, 0);
> if (IS_ERR_OR_NULL(folio))
> return true;
Looks like a bit of return value wonkiness here.. the function now
returns int, but we return true here, ret is a bool, etc.
Also JFYI it looks like filemap has an FGP_NOWAIT flag that when paired
with FGP_LOCK avoids the need for the extra code here.
Those nits aside this looks pretty good to me. Thanks for fixing it.
I'll have to give it a test (once I'm back most likely, I'm away for a
couple or so more days after today).
Brian
>
> + if (nonblock && !folio_trylock(folio)) {
> + folio_put(folio);
> + return -EAGAIN;
> + }
> +
> s = bch2_folio(folio);
> if (!s)
> goto unlock;
> @@ -3840,31 +3857,44 @@ static bool folio_hole_offset(struct address_space *mapping, loff_t *offset,
> static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
> loff_t start_offset,
> loff_t end_offset,
> - unsigned min_replicas)
> + unsigned min_replicas,
> + bool nonblock)
> {
> struct address_space *mapping = vinode->i_mapping;
> loff_t offset = start_offset;
>
> while (offset < end_offset &&
> - !folio_hole_offset(mapping, &offset, min_replicas))
> + !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
> ;
>
> return min(offset, end_offset);
> }
>
> -static void bch2_clamp_data_hole(struct inode *inode,
> - u64 *hole_start,
> - u64 *hole_end,
> - unsigned min_replicas)
> +static int bch2_clamp_data_hole(struct inode *inode,
> + u64 *hole_start,
> + u64 *hole_end,
> + unsigned min_replicas,
> + bool nonblock)
> {
> - *hole_start = bch2_seek_pagecache_hole(inode,
> - *hole_start << 9, *hole_end << 9, min_replicas) >> 9;
> + loff_t ret;
> +
> + ret = bch2_seek_pagecache_hole(inode,
> + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
> + if (ret < 0)
> + return ret;
> +
> + *hole_start = ret;
>
> if (*hole_start == *hole_end)
> - return;
> + return 0;
>
> - *hole_end = bch2_seek_pagecache_data(inode,
> - *hole_start << 9, *hole_end << 9, min_replicas) >> 9;
> + ret = bch2_seek_pagecache_data(inode,
> + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
> + if (ret < 0)
> + return ret;
> +
> + *hole_end = ret;
> + return 0;
> }
>
> static loff_t bch2_seek_hole(struct file *file, u64 offset)
> @@ -3896,12 +3926,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
> BTREE_ITER_SLOTS, k, ret) {
> if (k.k->p.inode != inode->v.i_ino) {
> next_hole = bch2_seek_pagecache_hole(&inode->v,
> - offset, MAX_LFS_FILESIZE, 0);
> + offset, MAX_LFS_FILESIZE, 0, false);
> break;
> } else if (!bkey_extent_is_data(k.k)) {
> next_hole = bch2_seek_pagecache_hole(&inode->v,
> max(offset, bkey_start_offset(k.k) << 9),
> - k.k->p.offset << 9, 0);
> + k.k->p.offset << 9, 0, false);
>
> if (next_hole < k.k->p.offset << 9)
> break;
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate()
2023-08-04 16:50 ` Brian Foster
@ 2023-08-05 16:04 ` Kent Overstreet
0 siblings, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-08-05 16:04 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-bcachefs
On Fri, Aug 04, 2023 at 12:50:53PM -0400, Brian Foster wrote:
> Nice. I recall that I had tried a quick experiment to just remove the
> drop_locks_do() here and did end up producing a lockup, but I forget the
> details. Aside from the general btree node locking principle you
> mentioned in the previous thread, a brief comment on the hard lock
> ordering rule here might be nice for future reference.
*nod* - added that
> > - folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
> > + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
> > + !nonblock ? FGP_LOCK : 0, 0);
> > if (IS_ERR_OR_NULL(folio))
> > return true;
>
> Looks like a bit of return value wonkiness here.. the function now
> returns int, but we return true here, ret is a bool, etc.
It looks fine to me; it would definitely make more sense in Rust where
the return value would be Result<Bool, Errcode>, that's just how I'm
thinking about it here, even if it's a bit weird in C.
> Also JFYI it looks like filemap has an FGP_NOWAIT flag that when paired
> with FGP_LOCK avoids the need for the extra code here.
Thanks, added that.
> Those nits aside this looks pretty good to me. Thanks for fixing it.
> I'll have to give it a test (once I'm back most likely, I'm away for a
> couple or so more days after today).
It's been in the master branch for a bit (and the fixup for FGP_NOWAIT
is in the testing branch now), so I think it's pretty well tested at
this point.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-08-05 16:05 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-03 7:50 [PATCH] bcachefs: Fix lock thrashing in __bchfs_fallocate() Kent Overstreet
2023-08-04 16:50 ` Brian Foster
2023-08-05 16:04 ` Kent Overstreet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox