* [PATCH 1/7] fs: export invalidate_inodes()
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 14:44 ` Jan Kara
2025-03-07 13:49 ` [PATCH 2/7] bcachefs: Stash a pointer to the filesystem for blk_holder_ops Kent Overstreet
` (5 subsequent siblings)
6 siblings, 1 reply; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel
Cc: Kent Overstreet, Alexander Viro, Christian Brauner, Jan Kara
Needed in bcachefs for implementing blk_holder_ops.mark_dead, since we
can't use the standard fs holder ops (whicth assume a single block device
filesystem).
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/inode.c | 1 +
fs/internal.h | 1 -
include/linux/fs.h | 1 +
3 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/inode.c b/fs/inode.c
index 5587aabdaa5e..6364779a7006 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -939,6 +939,7 @@ void invalidate_inodes(struct super_block *sb)
dispose_list(&dispose);
}
+EXPORT_SYMBOL_GPL(invalidate_inodes);
/*
* Isolate the inode from the LRU in preparation for freeing it.
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..7cb515cede3f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -207,7 +207,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
diff --git a/include/linux/fs.h b/include/linux/fs.h
index be3ad155ec9f..5196317598ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3251,6 +3251,7 @@ extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
+extern void invalidate_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);
/*
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH 1/7] fs: export invalidate_inodes()
2025-03-07 13:49 ` [PATCH 1/7] fs: export invalidate_inodes() Kent Overstreet
@ 2025-03-07 14:44 ` Jan Kara
2025-03-07 15:43 ` Kent Overstreet
0 siblings, 1 reply; 10+ messages in thread
From: Jan Kara @ 2025-03-07 14:44 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-bcachefs, linux-kernel, Alexander Viro, Christian Brauner,
Jan Kara
On Fri 07-03-25 08:49:25, Kent Overstreet wrote:
> Needed in bcachefs for implementing blk_holder_ops.mark_dead, since we
> can't use the standard fs holder ops (whicth assume a single block device
> filesystem).
>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Please use evict_inodes(). It is exactly the same and is actually already
exported. Since you are the second one in last month to ask for this let's
clean this up [1].
Honza
[1] https://lore.kernel.org/20250307144318.28120-2-jack@suse.cz
> ---
> fs/inode.c | 1 +
> fs/internal.h | 1 -
> include/linux/fs.h | 1 +
> 3 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 5587aabdaa5e..6364779a7006 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -939,6 +939,7 @@ void invalidate_inodes(struct super_block *sb)
>
> dispose_list(&dispose);
> }
> +EXPORT_SYMBOL_GPL(invalidate_inodes);
>
> /*
> * Isolate the inode from the LRU in preparation for freeing it.
> diff --git a/fs/internal.h b/fs/internal.h
> index e7f02ae1e098..7cb515cede3f 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -207,7 +207,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
> * fs-writeback.c
> */
> extern long get_nr_dirty_inodes(void);
> -void invalidate_inodes(struct super_block *sb);
>
> /*
> * dcache.c
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index be3ad155ec9f..5196317598ac 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3251,6 +3251,7 @@ extern void unlock_new_inode(struct inode *);
> extern void discard_new_inode(struct inode *);
> extern unsigned int get_next_ino(void);
> extern void evict_inodes(struct super_block *sb);
> +extern void invalidate_inodes(struct super_block *sb);
> void dump_mapping(const struct address_space *);
>
> /*
> --
> 2.47.2
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/7] fs: export invalidate_inodes()
2025-03-07 14:44 ` Jan Kara
@ 2025-03-07 15:43 ` Kent Overstreet
0 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 15:43 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-bcachefs, linux-kernel, Alexander Viro, Christian Brauner
On Fri, Mar 07, 2025 at 03:44:50PM +0100, Jan Kara wrote:
> On Fri 07-03-25 08:49:25, Kent Overstreet wrote:
> > Needed in bcachefs for implementing blk_holder_ops.mark_dead, since we
> > can't use the standard fs holder ops (whicth assume a single block device
> > filesystem).
> >
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Cc: Christian Brauner <brauner@kernel.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
>
> Please use evict_inodes(). It is exactly the same and is actually already
> exported. Since you are the second one in last month to ask for this let's
> clean this up [1].
Switched it to evict_inodes().
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 2/7] bcachefs: Stash a pointer to the filesystem for blk_holder_ops
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
2025-03-07 13:49 ` [PATCH 1/7] fs: export invalidate_inodes() Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 13:49 ` [PATCH 3/7] bcachefs: Make sure c->vfs_sb is set before starting fs Kent Overstreet
` (4 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
Note that we open block devices before we allocate bch_fs, but once
attached to a filesystem they will be closed before the bch_fs is torn
down - so stashing a pointer without a refcount looks incorrect but it's
not.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/super-io.c | 2 +-
fs/bcachefs/super.c | 7 +++++++
fs/bcachefs/super_types.h | 8 +++++++-
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 28724aee2c09..bd24b4f7eeb6 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -748,7 +748,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
- sb->holder = kmalloc(1, GFP_KERNEL);
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 10c281ad96eb..b653dd480591 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1431,6 +1431,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
+ /*
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
+ * attached to a filesystem, we will always close the block device
+ * before tearing down the filesystem object.
+ */
+ ca->disk_sb.holder->c = ca->fs;
+
ca->dev = ca->disk_sb.bdev->bd_dev;
percpu_ref_reinit(&ca->io_ref);
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 368a63d938cf..3a899f799d1d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
+struct bch_fs;
+
+struct bch_sb_handle_holder {
+ struct bch_fs *c;
+};
+
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
- void *holder;
+ struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 3/7] bcachefs: Make sure c->vfs_sb is set before starting fs
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
2025-03-07 13:49 ` [PATCH 1/7] fs: export invalidate_inodes() Kent Overstreet
2025-03-07 13:49 ` [PATCH 2/7] bcachefs: Stash a pointer to the filesystem for blk_holder_ops Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 13:49 ` [PATCH 4/7] bcachefs: Implement blk_holder_ops Kent Overstreet
` (3 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
This is necessary for the new blk_holder_ops, which want the vfs
super_block available for synchronization.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/fs.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 2c011a465588..459ca8259fc0 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
bch2_opts_apply(&c->opts, opts);
- ret = bch2_fs_start(c);
- if (ret)
- goto err_stop_fs;
+ /*
+ * need to initialise sb and set c->vfs_sb _before_ starting fs,
+ * for blk_holder_ops
+ */
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
@@ -2282,6 +2283,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
sb->s_shrink->seeks = 0;
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_put_super;
+
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 4/7] bcachefs: Implement blk_holder_ops
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
` (2 preceding siblings ...)
2025-03-07 13:49 ` [PATCH 3/7] bcachefs: Make sure c->vfs_sb is set before starting fs Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 13:49 ` [PATCH 5/7] bcachefs: Fix btree_node_scan io_ref handling Kent Overstreet
` (2 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
We can't use the standard fs_holder_ops because they're meant for single
device filesystems - fs_bdev_mark_dead() in particular - and they assume
that the blk_holder is the super_block, which also doesn't work for a
multi device filesystem.
These generally follow the standard fs_holder_ops; the
locking/refcounting is a bit simplified because c->ro_ref suffices, and
bch2_fs_bdev_mark_dead() is not necessarily shutting down the entire
filesystem.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/super-io.c | 3 --
fs/bcachefs/super.c | 97 ++++++++++++++++++++++++++++++++++++++++++
fs/bcachefs/super.h | 2 +
3 files changed, 99 insertions(+), 3 deletions(-)
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index bd24b4f7eeb6..dcb69c72555e 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
struct bch2_metadata_version {
u16 version;
const char *name;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index b653dd480591..05a2dc5ef513 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c)
}
set_bit(BCH_FS_started, &c->flags);
+ wake_up(&c->ro_ref_wait);
if (c->opts.read_only) {
bch2_fs_read_only(c);
@@ -2023,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
+{
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
+ struct bch_fs *c = holder->c;
+
+ if (c && !bch2_ro_ref_tryget(c))
+ c = NULL;
+
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ if (c)
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+ return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+ for_each_member_device(c, ca)
+ if (ca->disk_sb.bdev == bdev)
+ return ca;
+ return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ }
+
+ down_write(&c->state_lock);
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto unlock;
+
+ if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+ __bch2_dev_offline(c, ca);
+ } else {
+ if (sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ invalidate_inodes(sb);
+ }
+
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only(c);
+ }
+
+ bch2_dev_put(ca);
+unlock:
+ if (sb)
+ up_read(&sb->s_umount);
+ up_write(&c->state_lock);
+ bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ }
+
+ bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+ .mark_dead = bch2_fs_bdev_mark_dead,
+ .sync = bch2_fs_bdev_sync,
+};
+
/* Filesystem open: */
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 04f8287eff5c..23533bce5709 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
+
#endif /* _BCACHEFS_SUPER_H */
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 5/7] bcachefs: Fix btree_node_scan io_ref handling
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
` (3 preceding siblings ...)
2025-03-07 13:49 ` [PATCH 4/7] bcachefs: Implement blk_holder_ops Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 13:49 ` [PATCH 6/7] bcachefs: bch2_dev_get_ioref() may now sleep Kent Overstreet
2025-03-07 13:49 ` [PATCH 7/7] bcachefs: Implement freeze/thaw Kent Overstreet
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
This was completely fubar; it's now simplified a bit as well.
Note that for_each_online_member() takes and releases io_refs as it
iterates, so we need to release that if we break.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/btree_node_scan.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index fb73ec77c099..678161321e42 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -270,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
closure_put(w->cl);
kfree(w);
return 0;
@@ -289,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
percpu_ref_put(&ca->io_ref);
ret = -ENOMEM;
goto err;
}
- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ percpu_ref_get(&ca->io_ref);
+ wake_up_process(t);
}
err:
closure_sync(&cl);
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 6/7] bcachefs: bch2_dev_get_ioref() may now sleep
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
` (4 preceding siblings ...)
2025-03-07 13:49 ` [PATCH 5/7] bcachefs: Fix btree_node_scan io_ref handling Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
2025-03-07 13:49 ` [PATCH 7/7] bcachefs: Implement freeze/thaw Kent Overstreet
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
The next patch implementing freezing will change bch2_dev_get_ioref() to
sleep if a device is currently frozen.
Add an annotation and fix the journal code accordingly.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/journal_io.c | 5 ++++-
fs/bcachefs/sb-members.h | 2 ++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index c12d9f9bd536..a510755a8364 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1664,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}
bool completed = false;
+ bool do_discards = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1676,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done)
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
- bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@@ -1727,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
+
+ if (do_discards)
+ bch2_do_discards(c);
}
static void journal_write_endio(struct bio *bio)
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index b29b6c6c21dd..df91b02ce575 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
{
+ might_sleep();
+
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
if (ca && !percpu_ref_tryget(&ca->io_ref))
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 7/7] bcachefs: Implement freeze/thaw
2025-03-07 13:49 [PATCH 0/7] blk_holder_ops, freeze/thaw Kent Overstreet
` (5 preceding siblings ...)
2025-03-07 13:49 ` [PATCH 6/7] bcachefs: bch2_dev_get_ioref() may now sleep Kent Overstreet
@ 2025-03-07 13:49 ` Kent Overstreet
6 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2025-03-07 13:49 UTC (permalink / raw)
To: linux-bcachefs, linux-kernel; +Cc: Kent Overstreet
This fills out our blk_holders_ops with freeze and thaw callbacks, for
shutting down IO (generally during a system suspend).
This is implemented completely differently as on other filesystems
since we have a low level synchronization object which conveniently
works well for us - bch_dev.io_ref, normally used for guarding against a
device being offlined while in use.
bch2_dev_get_ioref() now checks if a freeze is in progress if it fails
to get ca->io_ref, and sleeps until complete and ca->io_ref is alive.
We also need a bit of synchronization for freeze/suspend vs. device
online/offline, done with the new bch_dev.io_ref_statechange_lock.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/bcachefs.h | 3 ++
fs/bcachefs/journal_io.c | 2 +-
fs/bcachefs/sb-members.c | 49 ++++++++++++++++++++++
fs/bcachefs/sb-members.h | 20 +--------
fs/bcachefs/super.c | 87 +++++++++++++++++++++++++++++++++++++---
5 files changed, 136 insertions(+), 25 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index d2c3f59a668f..d03aa62907ad 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -526,6 +526,9 @@ struct bch_dev {
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
+ struct mutex io_ref_statechange_lock;
+ unsigned frozen;
+ wait_queue_head_t frozen_wait;
struct bch_fs *fs;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index a510755a8364..6979fef5c128 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1769,7 +1769,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
if (!ca) {
/* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
+ bch_err(c, "missing device for journal write");
continue;
}
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 116131f95815..2363367cb32d 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -9,6 +9,55 @@
#include "sb-members.h"
#include "super-io.h"
+/*
+ * Use of bch2_dev_get_ioref() is subject to deadlocks if used incorrectly, and
+ * we cannot write asserts for correct usage, so: pay attention, because this is
+ * where we implement freeze.
+ *
+ * Waiting on an outstanding freeze to complete will indirectly wait on all
+ * other outstanding io_refs to be released. That means:
+ *
+ * - Don't use bch2_dev_get_ioref() if you already have an io_ref, use
+ * percpu_ref_get(). Since dev_get_ioref() has tryget() semantics, that's what
+ * you should be doing anyways.
+ *
+ * - All io_refs must be released without blocking on locks that might be held
+ * while calling dev_get_ioref(). This is easy to obey since we generally
+ * release io_refs from endio functions.
+ *
+ */
+struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+{
+ might_sleep();
+again:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (likely(ca)) {
+ if (unlikely(!percpu_ref_tryget(&ca->io_ref))) {
+ smp_mb();
+ if (ca->frozen) {
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+
+ wait_event(ca->frozen_wait, !ca->frozen);
+ bch2_dev_put(ca);
+ goto again;
+ }
+ ca = NULL;
+ }
+ }
+ rcu_read_unlock();
+
+ if (ca &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ return ca;
+
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ return NULL;
+}
+
void bch2_dev_missing(struct bch_fs *c, unsigned dev)
{
if (dev != BCH_SB_MEMBER_INVALID)
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index df91b02ce575..b3359ee63b0e 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -281,25 +281,7 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
return bch2_dev_tryget(c, dev_idx);
}
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
-{
- might_sleep();
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
- ca = NULL;
- rcu_read_unlock();
-
- if (ca &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
- return ca;
-
- if (ca)
- percpu_ref_put(&ca->io_ref);
- return NULL;
-}
+struct bch_dev *bch2_dev_get_ioref(struct bch_fs *, unsigned, int);
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 05a2dc5ef513..dfdeab7d847c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1236,6 +1236,22 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
+static void bch2_dev_io_ref_stop(struct bch_dev *ca)
+{
+ lockdep_assert_held(&ca->io_ref_statechange_lock);
+
+ reinit_completion(&ca->io_ref_completion);
+ percpu_ref_kill(&ca->io_ref);
+ wait_for_completion(&ca->io_ref_completion);
+}
+
+static void bch2_dev_io_ref_start(struct bch_dev *ca)
+{
+ lockdep_assert_held(&ca->io_ref_statechange_lock);
+
+ percpu_ref_reinit(&ca->io_ref);
+}
+
static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
@@ -1246,13 +1262,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->io_ref_completion);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->io_ref_completion);
-
bch2_dev_unlink(ca);
+ mutex_lock(&ca->io_ref_statechange_lock);
+ bch2_dev_io_ref_stop(ca);
+
bch2_free_super(&ca->disk_sb);
+ mutex_unlock(&ca->io_ref_statechange_lock);
+
bch2_dev_journal_exit(ca);
}
@@ -1334,6 +1351,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
+ mutex_init(&ca->io_ref_statechange_lock);
+ init_waitqueue_head(&ca->frozen_wait);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1428,6 +1447,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (ret)
return ret;
+ mutex_lock(&ca->io_ref_statechange_lock);
+
/* Commit: */
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
@@ -1441,7 +1462,9 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->dev = ca->disk_sb.bdev->bd_dev;
- percpu_ref_reinit(&ca->io_ref);
+ if (!ca->frozen)
+ bch2_dev_io_ref_start(ca);
+ mutex_unlock(&ca->io_ref_statechange_lock);
return 0;
}
@@ -2115,9 +2138,63 @@ static void bch2_fs_bdev_sync(struct block_device *bdev)
bch2_ro_ref_put(c);
}
+static int bch2_fs_bdev_freeze(struct block_device *bdev)
+{
+ int ret = -EINVAL;
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return ret;
+
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto err;
+
+ mutex_lock(&ca->io_ref_statechange_lock);
+ ca->frozen++;
+ smp_mb();
+ bch2_dev_io_ref_stop(ca);
+ mutex_unlock(&ca->io_ref_statechange_lock);
+
+ ret = sync_blockdev(bdev);
+
+ bch2_dev_put(ca);
+err:
+ bch2_ro_ref_put(c);
+ return ret;
+}
+
+static int bch2_fs_bdev_thaw(struct block_device *bdev)
+{
+ int ret = -EINVAL;
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return ret;
+
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto err;
+
+ mutex_lock(&ca->io_ref_statechange_lock);
+ if (ca->disk_sb.bdev &&
+ ca->frozen == 1)
+ bch2_dev_io_ref_start(ca);
+ --ca->frozen;
+ wake_up(&ca->frozen_wait);
+ mutex_unlock(&ca->io_ref_statechange_lock);
+
+ ret = 0;
+
+ bch2_dev_put(ca);
+err:
+ bch2_ro_ref_put(c);
+ return ret;
+}
+
const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
.mark_dead = bch2_fs_bdev_mark_dead,
.sync = bch2_fs_bdev_sync,
+ .freeze = bch2_fs_bdev_freeze,
+ .thaw = bch2_fs_bdev_thaw,
};
/* Filesystem open: */
--
2.47.2
^ permalink raw reply related [flat|nested] 10+ messages in thread