* [PATCH 0/6] rebalance_work btree
@ 2023-10-24 19:14 Kent Overstreet
2023-10-24 19:14 ` [PATCH 1/6] bcachefs: move.c exports, refactoring Kent Overstreet
` (5 more replies)
0 siblings, 6 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
Here's the rebalance_work patchset, which I expect to merge shortly.
It eliminates most scanning by the rebalance thread, which is a nice
scalability improvement. Cheers :)
Kent Overstreet (6):
bcachefs: move.c exports, refactoring
bcachefs: moving_context now owns a btree_trans
bcachefs: move: convert to bbpos
bcachefs: move: move_stats refactoring
bcachefs: bch2_inum_opts_get()
bcachefs: rebalance_work
fs/bcachefs/bbpos.h | 14 +-
fs/bcachefs/bbpos_types.h | 18 ++
fs/bcachefs/bcachefs.h | 4 +-
fs/bcachefs/bcachefs_format.h | 34 +--
fs/bcachefs/buckets.c | 10 +
fs/bcachefs/chardev.c | 4 +-
fs/bcachefs/compress.c | 18 +-
fs/bcachefs/compress.h | 2 +
fs/bcachefs/data_update.c | 21 +-
fs/bcachefs/data_update.h | 1 +
fs/bcachefs/extents.c | 155 +++++++++-
fs/bcachefs/extents.h | 20 ++
fs/bcachefs/inode.c | 12 +
fs/bcachefs/inode.h | 1 +
fs/bcachefs/io_misc.c | 11 +-
fs/bcachefs/io_write.c | 20 +-
fs/bcachefs/move.c | 277 ++++++++---------
fs/bcachefs/move.h | 36 ++-
fs/bcachefs/move_types.h | 8 +-
fs/bcachefs/movinggc.c | 37 ++-
fs/bcachefs/rebalance.c | 553 ++++++++++++++++++++--------------
fs/bcachefs/rebalance.h | 9 +-
fs/bcachefs/rebalance_types.h | 31 +-
fs/bcachefs/recovery.c | 1 +
fs/bcachefs/recovery_types.h | 1 +
fs/bcachefs/reflink.c | 19 +-
fs/bcachefs/sysfs.c | 14 +-
fs/bcachefs/trace.c | 1 +
fs/bcachefs/trace.h | 31 +-
fs/bcachefs/xattr.c | 2 +-
30 files changed, 848 insertions(+), 517 deletions(-)
create mode 100644 fs/bcachefs/bbpos_types.h
--
2.42.0
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 1/6] bcachefs: move.c exports, refactoring
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-10-24 19:14 ` [PATCH 2/6] bcachefs: moving_context now owns a btree_trans Kent Overstreet
` (4 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
Prep work for the new rebalance code - we need a few helpers exported.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/move.c | 119 +++++++++++++++++++++-------------------
fs/bcachefs/move.h | 22 +++++++-
fs/bcachefs/rebalance.c | 3 +-
3 files changed, 85 insertions(+), 59 deletions(-)
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 38b076ff1906..12167791e34c 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -171,8 +171,8 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
}
}
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+ struct btree_trans *trans)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
@@ -287,14 +287,13 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
-static int bch2_move_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bch_io_opts io_opts,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct data_update_opts data_opts)
+int bch2_move_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bch_io_opts io_opts,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -356,7 +355,7 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, btree_id, k);
+ io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
@@ -371,6 +370,9 @@ static int bch2_move_extent(struct btree_trans *trans,
io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -400,7 +402,7 @@ static int bch2_move_extent(struct btree_trans *trans,
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
- btree_id, k, 0,
+ iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -464,9 +466,9 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
return &io_opts->fs_io_opts;
}
-static int bch2_move_get_io_opts_one(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct bkey_s_c extent_k)
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -497,8 +499,8 @@ static int bch2_move_get_io_opts_one(struct btree_trans *trans,
return 0;
}
-static int move_ratelimit(struct btree_trans *trans,
- struct moving_context *ctxt)
+int bch2_move_ratelimit(struct btree_trans *trans,
+ struct moving_context *ctxt)
{
struct bch_fs *c = trans->c;
u64 delay;
@@ -545,7 +547,8 @@ static int move_ratelimit(struct btree_trans *trans,
return 0;
}
-static int __bch2_move_data(struct moving_context *ctxt,
+static int bch2_move_data_btree(struct btree_trans *trans,
+ struct moving_context *ctxt,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
@@ -555,7 +558,6 @@ static int __bch2_move_data(struct moving_context *ctxt,
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
@@ -577,7 +579,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
- while (!move_ratelimit(trans, ctxt)) {
+ while (!bch2_move_ratelimit(trans, ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -616,7 +618,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
k = bkey_i_to_s_c(sk.k);
ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
- *io_opts, btree_id, k, data_opts);
+ *io_opts, k, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
@@ -630,9 +632,6 @@ static int __bch2_move_data(struct moving_context *ctxt,
/* XXX signal failure */
goto next;
}
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@@ -641,48 +640,60 @@ static int __bch2_move_data(struct moving_context *ctxt,
}
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
+int __bch2_move_data(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt;
+ struct bch_fs *c = trans->c;
enum btree_id id;
int ret = 0;
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ ctxt->stats->btree_id = id;
- if (id != BTREE_ID_extents &&
- id != BTREE_ID_reflink)
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
continue;
- if (!bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = __bch2_move_data(&ctxt,
- id == start_btree_id ? start_pos : POS_MIN,
- id == end_btree_id ? end_pos : POS_MAX,
+ ret = bch2_move_data_btree(trans, ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+
+ struct btree_trans *trans;
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ trans = bch2_trans_get(c);
+ ret = __bch2_move_data(trans, &ctxt, start, end, pred, arg);
+ bch2_trans_put(trans);
bch2_moving_ctxt_exit(&ctxt);
return ret;
@@ -739,7 +750,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
- while (!(ret = move_ratelimit(trans, ctxt))) {
+ while (!(ret = bch2_move_ratelimit(trans, ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -791,7 +802,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
ret = bch2_move_extent(trans, &iter, ctxt,
bucket_in_flight,
- io_opts, bp.btree_id, k, data_opts);
+ io_opts, k, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -804,8 +815,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
if (ret)
goto err;
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
@@ -1087,8 +1096,8 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
@@ -1111,8 +1120,8 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index aa4b65c4f960..67ca13f7e772 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "bbpos.h"
#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
@@ -61,6 +62,9 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
struct btree_trans *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *,
+ struct btree_trans *);
+int bch2_move_ratelimit(struct btree_trans *, struct moving_context *);
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
@@ -87,12 +91,26 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+int bch2_move_extent(struct btree_trans *,
+ struct btree_iter *,
+ struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct bch_io_opts,
+ struct bkey_s_c,
+ struct data_update_opts);
+
+int __bch2_move_data(struct btree_trans *,
+ struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
- enum btree_id, struct bpos,
- enum btree_id, struct bpos,
+ struct bbpos start,
+ struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 568f1e8e7507..92403fa79f1f 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -254,8 +254,7 @@ static int bch2_rebalance_thread(void *arg)
rebalance_work_reset(c);
bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
+ BBPOS_MIN, BBPOS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
&move_stats,
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/6] bcachefs: moving_context now owns a btree_trans
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
2023-10-24 19:14 ` [PATCH 1/6] bcachefs: move.c exports, refactoring Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-10-24 19:14 ` [PATCH 3/6] bcachefs: move: convert to bbpos Kent Overstreet
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
btree_trans and moving_context are used together, and having the
moving_context own the transaction object reduces some plumbing.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/data_update.c | 2 +-
fs/bcachefs/move.c | 93 ++++++++++++++++++---------------------
fs/bcachefs/move.h | 27 +++++-------
fs/bcachefs/movinggc.c | 36 +++++++--------
4 files changed, 70 insertions(+), 88 deletions(-)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 899ff46de8e0..9b42d37dc344 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -487,7 +487,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 12167791e34c..570189eda6fd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -157,13 +157,11 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
- if (trans)
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
list_del(&io->read_list);
@@ -171,21 +169,20 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
}
}
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
- struct bch_fs *c = ctxt->c;
+ struct bch_fs *c = ctxt->trans->c;
- move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
@@ -203,6 +200,9 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -214,7 +214,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
- ctxt->c = c;
+ ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
@@ -287,14 +287,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
-int bch2_move_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
+int bch2_move_extent(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
- struct bch_io_opts io_opts,
+ struct btree_iter *iter,
struct bkey_s_c k,
+ struct bch_io_opts io_opts,
struct data_update_opts data_opts)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
@@ -499,14 +499,13 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
return 0;
}
-int bch2_move_ratelimit(struct btree_trans *trans,
- struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@@ -516,7 +515,7 @@ int bch2_move_ratelimit(struct btree_trans *trans,
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
if (delay) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -529,7 +528,7 @@ int bch2_move_ratelimit(struct btree_trans *trans,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@@ -538,7 +537,7 @@ int bch2_move_ratelimit(struct btree_trans *trans,
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@@ -547,14 +546,14 @@ int bch2_move_ratelimit(struct btree_trans *trans,
return 0;
}
-static int bch2_move_data_btree(struct btree_trans *trans,
- struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
{
- struct bch_fs *c = ctxt->c;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
@@ -579,7 +578,7 @@ static int bch2_move_data_btree(struct btree_trans *trans,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
- while (!bch2_move_ratelimit(trans, ctxt)) {
+ while (!bch2_move_ratelimit(ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -617,15 +616,14 @@ static int bch2_move_data_btree(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
- *io_opts, k, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
@@ -646,13 +644,12 @@ static int bch2_move_data_btree(struct btree_trans *trans,
return ret;
}
-int __bch2_move_data(struct btree_trans *trans,
- struct moving_context *ctxt,
+int __bch2_move_data(struct moving_context *ctxt,
struct bbpos start,
struct bbpos end,
move_pred_fn pred, void *arg)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = ctxt->trans->c;
enum btree_id id;
int ret = 0;
@@ -665,7 +662,7 @@ int __bch2_move_data(struct btree_trans *trans,
!bch2_btree_id_root(c, id)->b)
continue;
- ret = bch2_move_data_btree(trans, ctxt,
+ ret = bch2_move_data_btree(ctxt,
id == start.btree ? start.pos : POS_MIN,
id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
@@ -686,26 +683,23 @@ int bch2_move_data(struct bch_fs *c,
move_pred_fn pred, void *arg)
{
- struct btree_trans *trans;
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- trans = bch2_trans_get(c);
- ret = __bch2_move_data(trans, &ctxt, start, end, pred, arg);
- bch2_trans_put(trans);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int __bch2_evacuate_bucket(struct btree_trans *trans,
- struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
- struct bch_fs *c = ctxt->c;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
@@ -750,7 +744,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
- while (!(ret = bch2_move_ratelimit(trans, ctxt))) {
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -800,16 +794,15 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
i++;
}
- ret = bch2_move_extent(trans, &iter, ctxt,
- bucket_in_flight,
- io_opts, k, data_opts);
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
@@ -865,14 +858,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
struct write_point_specifier wp,
bool wait_on_copygc)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
- bch2_trans_put(trans);
return ret;
}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 67ca13f7e772..39e762b103ca 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -12,7 +12,7 @@
struct bch_read_bio;
struct moving_context {
- struct bch_fs *c;
+ struct btree_trans *trans;
struct list_head list;
void *fn;
@@ -38,10 +38,10 @@ struct moving_context {
wait_queue_head_t wait;
};
-#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
+#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
@@ -60,11 +60,9 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
- struct btree_trans *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *,
- struct btree_trans *);
-int bch2_move_ratelimit(struct btree_trans *, struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
@@ -95,16 +93,14 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-int bch2_move_extent(struct btree_trans *,
- struct btree_iter *,
- struct moving_context *,
+int bch2_move_extent(struct moving_context *,
struct move_bucket_in_flight *,
- struct bch_io_opts,
+ struct btree_iter *,
struct bkey_s_c,
+ struct bch_io_opts,
struct data_update_opts);
-int __bch2_move_data(struct btree_trans *,
- struct moving_context *,
+int __bch2_move_data(struct moving_context *,
struct bbpos,
struct bbpos,
move_pred_fn, void *);
@@ -117,8 +113,7 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct btree_trans *,
- struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 4017120baeee..a2862e322658 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret;
}
-static void move_buckets_wait(struct btree_trans *trans,
- struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
@@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
while ((i = list->first)) {
if (flush)
- move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
@@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
kfree(i);
}
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
@@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
@@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
- move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@@ -188,10 +187,10 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
}
noinline
-static int bch2_copygc(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
@@ -202,7 +201,7 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@@ -221,7 +220,7 @@ static int bch2_copygc(struct btree_trans *trans,
break;
}
- ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@@ -300,7 +299,6 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
- struct btree_trans *trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
@@ -317,7 +315,6 @@ static int bch2_copygc_thread(void *arg)
}
set_freezable();
- trans = bch2_trans_get(c);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -325,16 +322,16 @@ static int bch2_copygc_thread(void *arg)
false);
while (!ret && !kthread_should_stop()) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
__refrigerator(false);
continue;
}
@@ -345,7 +342,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@@ -355,15 +352,14 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(trans, &ctxt, &buckets);
+ ret = bch2_copygc(&ctxt, &buckets);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
}
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
rhashtable_destroy(&buckets.table);
- bch2_trans_put(trans);
bch2_moving_ctxt_exit(&ctxt);
return 0;
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 3/6] bcachefs: move: convert to bbpos
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
2023-10-24 19:14 ` [PATCH 1/6] bcachefs: move.c exports, refactoring Kent Overstreet
2023-10-24 19:14 ` [PATCH 2/6] bcachefs: moving_context now owns a btree_trans Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-10-24 19:14 ` [PATCH 4/6] bcachefs: move: move_stats refactoring Kent Overstreet
` (2 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/bbpos.h | 14 +-------------
fs/bcachefs/bbpos_types.h | 18 ++++++++++++++++++
fs/bcachefs/chardev.c | 4 ++--
fs/bcachefs/data_update.c | 8 +++++---
fs/bcachefs/data_update.h | 1 +
fs/bcachefs/move.c | 19 ++++++++-----------
fs/bcachefs/move_types.h | 5 +++--
7 files changed, 38 insertions(+), 31 deletions(-)
create mode 100644 fs/bcachefs/bbpos_types.h
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
index 0038bc28ba8c..be2edced5213 100644
--- a/fs/bcachefs/bbpos.h
+++ b/fs/bcachefs/bbpos.h
@@ -2,22 +2,10 @@
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
+#include "bbpos_types.h"
#include "bkey_methods.h"
#include "btree_cache.h"
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
-
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
new file mode 100644
index 000000000000..5198e94cf3b8
--- /dev/null
+++ b/fs/bcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index f69e15dc699c..4bb88aefed12 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.btree_id,
- .p.pos = ctx->stats.pos,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 9b42d37dc344..e445c441764c 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -281,11 +281,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
}
continue;
nowork:
- if (m->ctxt && m->ctxt->stats) {
+ if (m->stats && m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
+ &m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -439,6 +439,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt->stats;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 7ca1f98d7e94..9dc17b9d8379 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -23,6 +23,7 @@ struct data_update {
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
+ struct bch_move_stats *stats;
struct bch_write_op op;
};
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 570189eda6fd..753755a627d5 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -367,7 +367,6 @@ int bch2_move_extent(struct moving_context *ctxt,
BUG_ON(ret);
- io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
if (ctxt->rate)
@@ -567,8 +566,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->btree_id = btree_id;
- ctxt->stats->pos = start;
+ ctxt->stats->pos = BBPOS(btree_id, start);
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
@@ -595,7 +593,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
break;
if (ctxt->stats)
- ctxt->stats->pos = iter.pos;
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
@@ -656,7 +654,7 @@ int __bch2_move_data(struct moving_context *ctxt,
for (id = start.btree;
id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- ctxt->stats->btree_id = id;
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
if (!btree_type_has_ptrs(id) ||
!bch2_btree_id_root(c, id)->b)
@@ -894,7 +892,7 @@ static int bch2_move_btree(struct bch_fs *c,
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
@@ -913,7 +911,7 @@ static int bch2_move_btree(struct bch_fs *c,
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter.pos;
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
@@ -1139,10 +1137,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
prt_newline(out);
- prt_printf(out, " data type %s btree_id %s position: ",
- bch2_data_types[stats->data_type],
- bch2_btree_id_str(stats->btree_id));
- bch2_bpos_to_text(out, stats->pos);
+ prt_printf(out, " data type %s position: ",
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index baf1f8570b3f..f402aa179bbe 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -2,10 +2,11 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
+#include "bbpos_types.h"
+
struct bch_move_stats {
enum bch_data_type data_type;
- enum btree_id btree_id;
- struct bpos pos;
+ struct bbpos pos;
struct list_head list;
char name[32];
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 4/6] bcachefs: move: move_stats refactoring
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
` (2 preceding siblings ...)
2023-10-24 19:14 ` [PATCH 3/6] bcachefs: move: convert to bbpos Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-10-24 19:14 ` [PATCH 5/6] bcachefs: bch2_inum_opts_get() Kent Overstreet
2023-10-24 19:14 ` [PATCH 6/6] bcachefs: rebalance_work Kent Overstreet
5 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
data_progress_list is gone - it was redundant with moving_context_list.
The upcoming rebalance rewrite is going to have it using two different
move_stats objects with the same moving_context, depending on whether
it's scanning or using the rebalance_work btree - this patch plumbs
stats around a bit differently so that will work.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/bcachefs.h | 3 --
fs/bcachefs/data_update.c | 2 +-
fs/bcachefs/move.c | 98 +++++++++++++++++++++------------------
fs/bcachefs/move.h | 5 +-
fs/bcachefs/move_types.h | 3 +-
fs/bcachefs/movinggc.c | 1 +
fs/bcachefs/trace.c | 1 +
fs/bcachefs/trace.h | 31 +++++++++----
8 files changed, 82 insertions(+), 62 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 1e0191197de1..bff6324447e1 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -939,9 +939,6 @@ struct bch_fs {
struct list_head moving_context_list;
struct mutex moving_context_lock;
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
-
/* REBALANCE */
struct bch_fs_rebalance rebalance;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index e445c441764c..4860f8293a4f 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -440,7 +440,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->btree_id = btree_id;
m->data_opts = data_opts;
m->ctxt = ctxt;
- m->stats = ctxt->stats;
+ m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 753755a627d5..1b15b010461a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -60,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
}
}
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
struct moving_io {
struct list_head read_list;
struct list_head io_list;
@@ -190,13 +176,6 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
- if (ctxt->stats) {
- progress_list_del(c, ctxt->stats);
- trace_move_data(c,
- atomic64_read(&ctxt->stats->sectors_moved),
- atomic64_read(&ctxt->stats->keys_moved));
- }
-
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
@@ -231,16 +210,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
+}
- if (stats) {
- progress_list_add(c, stats);
- stats->data_type = BCH_DATA_user;
- }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
@@ -303,6 +283,8 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -878,14 +860,18 @@ static int bch2_move_btree(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_trans *trans = bch2_trans_get(c);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
- progress_list_add(c, stats);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
@@ -933,14 +919,10 @@ static int bch2_move_btree(struct bch_fs *c,
break;
}
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
- progress_list_del(c, stats);
return ret;
}
@@ -1061,8 +1043,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
mutex_unlock(&c->sb_lock);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1093,6 +1074,8 @@ int bch2_data_job(struct bch_fs *c,
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
@@ -1117,10 +1100,13 @@ int bch2_data_job(struct bch_fs *c,
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
@@ -1129,18 +1115,43 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- struct bch_move_stats *stats = ctxt->stats;
- struct moving_io *io;
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
- prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
prt_newline(out);
- prt_printf(out, " data type %s position: ",
- bch2_data_types[stats->data_type]);
- bch2_bbpos_to_text(out, stats->pos);
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
+
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@@ -1181,7 +1192,4 @@ void bch2_fs_move_init(struct bch_fs *c)
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 39e762b103ca..1b1e8678bfae 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -127,7 +127,10 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index f402aa179bbe..e22841ef31e4 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -7,13 +7,12 @@
struct bch_move_stats {
enum bch_data_type data_type;
struct bbpos pos;
- struct list_head list;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
- atomic64_t sectors_moved;
atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
atomic64_t sectors_raced;
};
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index a2862e322658..f73b9b7f4bf7 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -361,6 +361,7 @@ static int bch2_copygc_thread(void *arg)
move_buckets_wait(&ctxt, &buckets, true);
rhashtable_destroy(&buckets.table);
bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
return 0;
}
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
index 33efa6005c6f..dc48b52b01b4 100644
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
@@ -7,6 +7,7 @@
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
+#include "move_types.h"
#include "opts.h"
#include "six.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 2308f49f3b2e..81f72b2add09 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -767,25 +767,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
);
TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, sectors_moved )
+ __field(dev_t, dev )
__field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
),
- TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->keys_moved)
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
);
TRACE_EVENT(evacuate_bucket,
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 5/6] bcachefs: bch2_inum_opts_get()
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
` (3 preceding siblings ...)
2023-10-24 19:14 ` [PATCH 4/6] bcachefs: move: move_stats refactoring Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-10-24 19:14 ` [PATCH 6/6] bcachefs: rebalance_work Kent Overstreet
5 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
New helper for new rebalance code
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/inode.c | 12 ++++++++++++
fs/bcachefs/inode.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index a3921c397ea2..23fcd442c514 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -981,6 +981,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index a7464e1b6960..2781e3281583 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -200,6 +200,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 6/6] bcachefs: rebalance_work
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
` (4 preceding siblings ...)
2023-10-24 19:14 ` [PATCH 5/6] bcachefs: bch2_inum_opts_get() Kent Overstreet
@ 2023-10-24 19:14 ` Kent Overstreet
2023-11-01 17:02 ` Nathan Chancellor
5 siblings, 1 reply; 10+ messages in thread
From: Kent Overstreet @ 2023-10-24 19:14 UTC (permalink / raw)
To: linux-bcachefs; +Cc: Kent Overstreet
This adds a new btree, rebalance_work, to eliminate scanning required
for finding extents that need work done on them in the background - i.e.
for the background_target and background_compression options.
rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
extent in the extents or reflink btree at the same pos.
A new extent field is added, bch_extent_rebalance, which indicates that
this extent has work that needs to be done in the background - and which
options to use. This allows per-inode options to be propagated to
indirect extents - at least in some circumstances. In this patch,
changing IO options on a file will not propagate the new options to
indirect extents pointed to by that file.
Updating (setting/clearing) the rebalance_work btree is done by the
extent trigger, which looks at the bch_extent_rebalance field.
Scanning is still required after changing IO path options - either just
for a given inode, or for the whole filesystem. We indicate that
scanning is required by adding a KEY_TYPE_cookie key to the
rebalance_work btree: the cookie counter is so that we can detect that
scanning is still required when an option has been flipped mid-way
through an existing scan.
Future possible work:
- Propagate options to indirect extents when being changed
- Add other IO path options - nr_replicas, ec, to rebalance_work so
they can be applied in the background when they change
- Add a counter, for bcachefs fs usage output, showing the pending
amount of rebalance work: we'll probably want to do this after the
disk space accounting rewrite (moving it to a new btree)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
fs/bcachefs/bcachefs.h | 1 +
fs/bcachefs/bcachefs_format.h | 34 +--
fs/bcachefs/buckets.c | 10 +
fs/bcachefs/compress.c | 18 +-
fs/bcachefs/compress.h | 2 +
fs/bcachefs/data_update.c | 11 +-
fs/bcachefs/extents.c | 155 +++++++++-
fs/bcachefs/extents.h | 20 ++
fs/bcachefs/io_misc.c | 11 +-
fs/bcachefs/io_write.c | 20 +-
fs/bcachefs/rebalance.c | 552 ++++++++++++++++++++--------------
fs/bcachefs/rebalance.h | 9 +-
fs/bcachefs/rebalance_types.h | 31 +-
fs/bcachefs/recovery.c | 1 +
fs/bcachefs/recovery_types.h | 1 +
fs/bcachefs/reflink.c | 19 +-
fs/bcachefs/sysfs.c | 14 +-
fs/bcachefs/xattr.c | 2 +-
18 files changed, 597 insertions(+), 314 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index bff6324447e1..68f0ff03c28a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -464,6 +464,7 @@ enum gc_phase {
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 99749f3315fe..e04999c57892 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
#endif
};
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:22,
- replicas:4,
- generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 generation:32,
- replicas:4,
- unused:22,
- type:6;
-#endif
-};
-
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- unused:33,
- compression:8,
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
- unused:33,
- type:7;
+ unused:34,
+ type:6;
#endif
};
@@ -1682,7 +1668,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1693,7 +1681,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -2306,7 +2294,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert))
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5c1eca183243..a8af803e7289 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1536,6 +1536,16 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 0e3981f42526..a8b148ec2a2b 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -697,18 +697,26 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
return ret;
}
-void bch2_opt_compression_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
{
struct bch_compression_opt opt = bch2_compression_decode(v);
- prt_str(out, bch2_compression_opts[opt.type]);
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
if (opt.level)
prt_printf(out, ":%u", opt.level);
}
+void bch2_opt_compression_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ return bch2_compression_opt_to_text(out, v);
+}
+
int bch2_opt_compression_validate(u64 v, struct printbuf *err)
{
if (!bch2_compression_opt_valid(v)) {
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index b938fc936365..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -58,6 +58,8 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_opt_compression_validate(u64, struct printbuf *);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4860f8293a4f..d116f2f03db2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -13,6 +13,7 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
+#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
@@ -251,11 +252,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p);
- if (ret)
- goto err;
-
- ret = bch2_trans_update(trans, &iter, insert,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index ccb62fa22b04..0c60d49c3599 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
@@ -757,18 +758,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
/*
* Returns pointer to the next entry after the one being dropped:
*/
@@ -1048,6 +1037,18 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
(u64) ec->idx, ec->block);
break;
}
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1223,9 +1224,18 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
break;
}
+ }
}
if (!nr_ptrs) {
@@ -1289,6 +1299,125 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+ /*
+ * For indirect extents, don't delete the rebalance entry when
+ * we're finished so that we know we specifically moved it or
+ * compressed it to its current location/compression type
+ */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ef1b9f18719d..9110acae7e3c 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -698,6 +710,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
void bch2_ptr_swab(struct bkey_s);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
/* Generic extent code: */
enum bch_extent_overlap {
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 119834cb8f9e..0979d5e05713 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -16,6 +16,7 @@
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
+#include "rebalance.h"
#include "subvolume.h"
/* Overwrites whatever was present with zeroes: */
@@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool insert = shift > 0;
int ret = 0;
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_INTENT);
@@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f7461f60d760..6d9c777213e3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -351,10 +351,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -495,7 +498,6 @@ static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- struct bkey_i *k;
unsigned dev;
int ret = 0;
@@ -505,14 +507,6 @@ static void __bch2_write_index(struct bch_write_op *op)
goto err;
}
- /*
- * probably not the ideal place to hook this in, but I don't
- * particularly want to plumb io_opts all the way through the btree
- * update stack right now
- */
- for_each_keylist_key(keys, k)
- bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 92403fa79f1f..6ba8574b4a69 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -3,13 +3,18 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
+#include "error.h"
+#include "inode.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
@@ -17,301 +22,398 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i;
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
- data_opts->rewrite_ptrs = 0;
- data_opts->target = io_opts->background_target;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k)) {
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type(io_opts->background_compression))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-
- if (io_opts->background_target) {
- const struct bch_extent_ptr *ptr;
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
- i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached &&
- !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
- bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
- return data_opts->rewrite_ptrs != 0;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- struct data_update_opts update_opts = { 0 };
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- unsigned i;
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
- if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
- return;
-
- i = 0;
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- if ((1U << i) && update_opts.rewrite_ptrs)
- if (atomic64_add_return(k.k->size,
- &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
- k.k->size)
- rebalance_wakeup(c);
- i++;
- }
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
}
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
- rebalance_wakeup(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-struct rebalance_work {
- int dev_most_full_idx;
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
-};
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
+{
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
-static void rebalance_work_accumulate(struct rebalance_work *w,
- u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- unsigned percent_full;
- u64 work = dev_work + unknown_dev;
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
- /* avoid divide by 0 */
- if (!capacity)
- return;
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
- if (work < dev_work || work < unknown_dev)
- work = U64_MAX;
- work = min(work, capacity);
+ memset(data_opts, 0, sizeof(*data_opts));
- percent_full = div64_u64(work * 100, capacity);
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
- if (percent_full >= w->dev_most_full_percent) {
- w->dev_most_full_idx = idx;
- w->dev_most_full_percent = percent_full;
- w->dev_most_full_work = work;
- w->dev_most_full_capacity = capacity;
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
}
- if (w->total_work + dev_work >= w->total_work &&
- w->total_work + dev_work >= dev_work)
- w->total_work += dev_work;
+ return k;
}
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
{
- struct bch_dev *ca;
- struct rebalance_work ret = { .dev_most_full_idx = -1 };
- u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
- unsigned i;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
- for_each_online_member(ca, c, i)
- rebalance_work_accumulate(&ret,
- atomic64_read(&ca->rebalance_work),
- unknown_dev,
- bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket),
- i);
+ bch2_bkey_buf_init(&sk);
- rebalance_work_accumulate(&ret,
- unknown_dev, 0, c->capacity, -1);
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- struct bch_dev *ca;
- unsigned i;
+ unsigned target, compression;
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
- atomic64_set(&c->rebalance.work_unknown_dev, 0);
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
}
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
- u64 utime, stime;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
}
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
{
- struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
+ struct bch_dev *ca;
struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- struct bch_move_stats move_stats;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
- u64 io_start;
- long throttle;
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = 128 * 2048;
+ unsigned i;
- set_freezable();
+ for_each_rw_member(ca, c, i)
+ min_member_capacity = min(min_member_capacity,
+ ca->mi.nbuckets * ca->mi.bucket_size);
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
+
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
- io_start = atomic64_read(&clock->now);
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
- bch2_move_stats_init(&move_stats, "rebalance");
- while (!kthread_wait_freezable(r->enabled)) {
- cond_resched();
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
- start = jiffies;
- cputime = curr_cputime();
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
- w = rebalance_work(c);
- BUG_ON(!w.dev_most_full_capacity);
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
- if (!w.total_work) {
- r->state = REBALANCE_WAITING;
- kthread_wait_freezable(rebalance_work(c).total_work);
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret || !k.k)
+ break;
- /*
- * If there isn't much work to do, throttle cpu usage:
- */
- throttle = prev_run_cputime * 100 /
- max(1U, w.dev_most_full_percent) -
- prev_run_time;
-
- if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->throttled_until_iotime = io_start +
- div_u64(w.dev_most_full_capacity *
- (20 - w.dev_most_full_percent),
- 50);
-
- if (atomic64_read(&clock->now) + clock->max_slop <
- r->throttled_until_iotime) {
- r->throttled_until_cputime = start + throttle;
- r->state = REBALANCE_THROTTLED;
-
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
- continue;
- }
- }
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
- /* minimum 1 mb/sec: */
- r->pd.rate.rate =
- max_t(u64, 1 << 11,
- r->pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- io_start = atomic64_read(&clock->now);
- p = w;
- prev_start = start;
- prev_cputime = cputime;
-
- r->state = REBALANCE_RUNNING;
- memset(&move_stats, 0, sizeof(move_stats));
- rebalance_work_reset(c);
-
- bch2_move_data(c,
- BBPOS_MIN, BBPOS_MAX,
- /* ratelimiting disabled for now */
- NULL, /* &r->pd.rate, */
- &move_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true,
- rebalance_pred, NULL);
+ bch2_btree_iter_advance(&rebalance_work_iter);
}
- return 0;
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock(trans);
+ rebalance_wait(c);
+ }
+
+ bch_err_fn(c, ret);
+ return ret;
}
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
{
+ struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
- struct rebalance_work w = rebalance_work(c);
+ struct moving_context ctxt;
+ int ret;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
+ set_freezable();
- prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
- prt_tab(out);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
- prt_human_readable_u64(out, w.dev_most_full_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
- prt_newline(out);
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
- prt_printf(out, "total work:");
- prt_tab(out);
+ bch2_moving_ctxt_exit(&ctxt);
- prt_human_readable_u64(out, w.total_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, c->capacity << 9);
- prt_newline(out);
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
- prt_printf(out, "rate:");
- prt_tab(out);
- prt_printf(out, "%u", r->pd.rate.rate);
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
+ printbuf_indent_add(out, 2);
switch (r->state) {
- case REBALANCE_WAITING:
- prt_printf(out, "waiting");
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
break;
- case REBALANCE_THROTTLED:
- prt_printf(out, "throttled for %lu sec or ",
- (r->throttled_until_cputime - jiffies) / HZ);
- prt_human_readable_u64(out,
- (r->throttled_until_iotime -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_printf(out, " io");
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
break;
- case REBALANCE_RUNNING:
- prt_printf(out, "running");
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
+ printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@@ -360,6 +462,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
-
- atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 7ade0bb81cce..28a52638f16c 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -4,6 +4,9 @@
#include "rebalance_types.h"
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
- struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 7462a92e9598..0fffb536c1d0 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -2,25 +2,36 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
+#include "bbpos_types.h"
#include "move_types.h"
-enum rebalance_state {
- REBALANCE_WAITING,
- REBALANCE_THROTTLED,
- REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
};
struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
+ struct task_struct __rcu *thread;
struct bch_pd_controller pd;
- atomic64_t work_unknown_dev;
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
- enum rebalance_state state;
- u64 throttled_until_iotime;
- unsigned long throttled_until_cputime;
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
- unsigned enabled:1;
+ unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 63faf70434ff..02025099c38f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -23,6 +23,7 @@
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
+#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index e2d8771909ef..515e3d62c2ac 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -42,6 +42,7 @@
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
+ x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 540c78cd4b0c..dbbdf1955f76 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -7,6 +7,7 @@
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
+#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
#include "super-io.h"
@@ -252,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
struct bpos src_want;
u64 dst_done;
u32 dst_snapshot, src_snapshot;
@@ -269,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
trans = bch2_trans_get(c);
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -352,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
bch2_disk_reservation_put(c, &disk_res);
}
bch2_trans_iter_exit(trans, &dst_iter);
@@ -386,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
+err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index d176e46684cc..db2727e5cc5f 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -212,7 +212,7 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -386,8 +386,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_copy_gc_wait)
bch2_copygc_wait_to_text(out, c);
- if (attr == &sysfs_rebalance_work)
- bch2_rebalance_work_to_text(out, c);
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@@ -646,7 +646,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_work,
+ &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -707,10 +707,8 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
- id == Opt_background_compression) && v) {
- bch2_rebalance_add_work(c, S64_MAX);
- rebalance_wakeup(c);
- }
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
ret = size;
err:
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index b069b1a62e25..74b41f567ab8 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,7 +590,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
- bch2_rebalance_add_work(c, inode->v.i_blocks);
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
}
--
2.42.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 6/6] bcachefs: rebalance_work
2023-10-24 19:14 ` [PATCH 6/6] bcachefs: rebalance_work Kent Overstreet
@ 2023-11-01 17:02 ` Nathan Chancellor
2023-11-01 17:07 ` Nathan Chancellor
0 siblings, 1 reply; 10+ messages in thread
From: Nathan Chancellor @ 2023-11-01 17:02 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcachefs, llvm
Hi Kent,
On Tue, Oct 24, 2023 at 03:14:11PM -0400, Kent Overstreet wrote:
> This adds a new btree, rebalance_work, to eliminate scanning required
> for finding extents that need work done on them in the background - i.e.
> for the background_target and background_compression options.
>
> rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
> extent in the extents or reflink btree at the same pos.
>
> A new extent field is added, bch_extent_rebalance, which indicates that
> this extent has work that needs to be done in the background - and which
> options to use. This allows per-inode options to be propagated to
> indirect extents - at least in some circumstances. In this patch,
> changing IO options on a file will not propagate the new options to
> indirect extents pointed to by that file.
>
> Updating (setting/clearing) the rebalance_work btree is done by the
> extent trigger, which looks at the bch_extent_rebalance field.
>
> Scanning is still required after changing IO path options - either just
> for a given inode, or for the whole filesystem. We indicate that
> scanning is required by adding a KEY_TYPE_cookie key to the
> rebalance_work btree: the cookie counter is so that we can detect that
> scanning is still required when an option has been flipped mid-way
> through an existing scan.
>
> Future possible work:
> - Propagate options to indirect extents when being changed
> - Add other IO path options - nr_replicas, ec, to rebalance_work so
> they can be applied in the background when they change
> - Add a counter, for bcachefs fs usage output, showing the pending
> amount of rebalance work: we'll probably want to do this after the
> disk space accounting rewrite (moving it to a new btree)
>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
<snip>
> diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
> index 540c78cd4b0c..dbbdf1955f76 100644
> --- a/fs/bcachefs/reflink.c
> +++ b/fs/bcachefs/reflink.c
> @@ -7,6 +7,7 @@
> #include "inode.h"
> #include "io_misc.h"
> #include "io_write.h"
> +#include "rebalance.h"
> #include "reflink.h"
> #include "subvolume.h"
> #include "super-io.h"
> @@ -252,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
> struct bpos dst_start = POS(dst_inum.inum, dst_offset);
> struct bpos src_start = POS(src_inum.inum, src_offset);
> struct bpos dst_end = dst_start, src_end = src_start;
> + struct bch_io_opts opts;
> struct bpos src_want;
> u64 dst_done;
> u32 dst_snapshot, src_snapshot;
> @@ -269,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
> bch2_bkey_buf_init(&new_src);
> trans = bch2_trans_get(c);
>
> + ret = bch2_inum_opts_get(trans, src_inum, &opts);
> + if (ret)
> + goto err;
> +
Not sure if this has been reported or fixed yet but this appears to
introduce a valid clang warning:
fs/bcachefs/reflink.c:275:6: error: variable 'dst_done' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
275 | if (ret)
| ^~~
fs/bcachefs/reflink.c:405:9: note: uninitialized use occurs here
405 | return dst_done ?: ret ?: ret2;
| ^~~~~~~~
fs/bcachefs/reflink.c:275:2: note: remove the 'if' if its condition is always false
275 | if (ret)
| ^~~~~~~~
276 | goto err;
| ~~~~~~~~
fs/bcachefs/reflink.c:258:14: note: initialize the variable 'dst_done' to silence this warning
258 | u64 dst_done;
| ^
| = 0
1 error generated.
I tried to reason my way through a patch but I am a little lost, hence
just the report :)
> bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
> BTREE_ITER_INTENT);
> bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
> @@ -352,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
> min(src_k.k->p.offset - src_want.offset,
> dst_end.offset - dst_iter.pos.offset));
>
> - ret = bch2_extent_update(trans, dst_inum, &dst_iter,
> - new_dst.k, &disk_res,
> - new_i_size, i_sectors_delta,
> - true);
> + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
> + opts.background_target,
> + opts.background_compression) ?:
> + bch2_extent_update(trans, dst_inum, &dst_iter,
> + new_dst.k, &disk_res,
> + new_i_size, i_sectors_delta,
> + true);
> bch2_disk_reservation_put(c, &disk_res);
> }
> bch2_trans_iter_exit(trans, &dst_iter);
> @@ -386,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
>
> bch2_trans_iter_exit(trans, &inode_iter);
> } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
> -
> +err:
> bch2_trans_put(trans);
> bch2_bkey_buf_exit(&new_src, c);
> bch2_bkey_buf_exit(&new_dst, c);
Cheers,
Nathan
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 6/6] bcachefs: rebalance_work
2023-11-01 17:02 ` Nathan Chancellor
@ 2023-11-01 17:07 ` Nathan Chancellor
2023-11-02 1:11 ` Kent Overstreet
0 siblings, 1 reply; 10+ messages in thread
From: Nathan Chancellor @ 2023-11-01 17:07 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcachefs, llvm
On Wed, Nov 01, 2023 at 10:02:55AM -0700, Nathan Chancellor wrote:
> Hi Kent,
>
> On Tue, Oct 24, 2023 at 03:14:11PM -0400, Kent Overstreet wrote:
> > This adds a new btree, rebalance_work, to eliminate scanning required
> > for finding extents that need work done on them in the background - i.e.
> > for the background_target and background_compression options.
> >
> > rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
> > extent in the extents or reflink btree at the same pos.
> >
> > A new extent field is added, bch_extent_rebalance, which indicates that
> > this extent has work that needs to be done in the background - and which
> > options to use. This allows per-inode options to be propagated to
> > indirect extents - at least in some circumstances. In this patch,
> > changing IO options on a file will not propagate the new options to
> > indirect extents pointed to by that file.
> >
> > Updating (setting/clearing) the rebalance_work btree is done by the
> > extent trigger, which looks at the bch_extent_rebalance field.
> >
> > Scanning is still required after changing IO path options - either just
> > for a given inode, or for the whole filesystem. We indicate that
> > scanning is required by adding a KEY_TYPE_cookie key to the
> > rebalance_work btree: the cookie counter is so that we can detect that
> > scanning is still required when an option has been flipped mid-way
> > through an existing scan.
> >
> > Future possible work:
> > - Propagate options to indirect extents when being changed
> > - Add other IO path options - nr_replicas, ec, to rebalance_work so
> > they can be applied in the background when they change
> > - Add a counter, for bcachefs fs usage output, showing the pending
> > amount of rebalance work: we'll probably want to do this after the
> > disk space accounting rewrite (moving it to a new btree)
> >
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
>
> <snip>
>
> > diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
> > index 540c78cd4b0c..dbbdf1955f76 100644
> > --- a/fs/bcachefs/reflink.c
> > +++ b/fs/bcachefs/reflink.c
> > @@ -7,6 +7,7 @@
> > #include "inode.h"
> > #include "io_misc.h"
> > #include "io_write.h"
> > +#include "rebalance.h"
> > #include "reflink.h"
> > #include "subvolume.h"
> > #include "super-io.h"
> > @@ -252,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
> > struct bpos dst_start = POS(dst_inum.inum, dst_offset);
> > struct bpos src_start = POS(src_inum.inum, src_offset);
> > struct bpos dst_end = dst_start, src_end = src_start;
> > + struct bch_io_opts opts;
> > struct bpos src_want;
> > u64 dst_done;
> > u32 dst_snapshot, src_snapshot;
> > @@ -269,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
> > bch2_bkey_buf_init(&new_src);
> > trans = bch2_trans_get(c);
> >
> > + ret = bch2_inum_opts_get(trans, src_inum, &opts);
> > + if (ret)
> > + goto err;
> > +
>
> Not sure if this has been reported or fixed yet but this appears to
> introduce a valid clang warning:
>
> fs/bcachefs/reflink.c:275:6: error: variable 'dst_done' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
> 275 | if (ret)
> | ^~~
> fs/bcachefs/reflink.c:405:9: note: uninitialized use occurs here
> 405 | return dst_done ?: ret ?: ret2;
> | ^~~~~~~~
> fs/bcachefs/reflink.c:275:2: note: remove the 'if' if its condition is always false
> 275 | if (ret)
> | ^~~~~~~~
> 276 | goto err;
> | ~~~~~~~~
> fs/bcachefs/reflink.c:258:14: note: initialize the variable 'dst_done' to silence this warning
> 258 | u64 dst_done;
> | ^
> | = 0
> 1 error generated.
>
> I tried to reason my way through a patch but I am a little lost, hence
> just the report :)
Actually, it just seems like dst_done should be explicitly
zero-initialized, so that ret is used as the return value. Don't know why I
was as confused as I was :) Would you like a formal patch, or to just
squash it in?
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index eb31df605c2e..6e1bfe9feb59 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -255,7 +255,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_end = dst_start, src_end = src_start;
struct bch_io_opts opts;
struct bpos src_want;
- u64 dst_done;
+ u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
> > bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
> > BTREE_ITER_INTENT);
> > bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
> > @@ -352,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
> > min(src_k.k->p.offset - src_want.offset,
> > dst_end.offset - dst_iter.pos.offset));
> >
> > - ret = bch2_extent_update(trans, dst_inum, &dst_iter,
> > - new_dst.k, &disk_res,
> > - new_i_size, i_sectors_delta,
> > - true);
> > + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
> > + opts.background_target,
> > + opts.background_compression) ?:
> > + bch2_extent_update(trans, dst_inum, &dst_iter,
> > + new_dst.k, &disk_res,
> > + new_i_size, i_sectors_delta,
> > + true);
> > bch2_disk_reservation_put(c, &disk_res);
> > }
> > bch2_trans_iter_exit(trans, &dst_iter);
> > @@ -386,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
> >
> > bch2_trans_iter_exit(trans, &inode_iter);
> > } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
> > -
> > +err:
> > bch2_trans_put(trans);
> > bch2_bkey_buf_exit(&new_src, c);
> > bch2_bkey_buf_exit(&new_dst, c);
>
> Cheers,
> Nathan
>
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 6/6] bcachefs: rebalance_work
2023-11-01 17:07 ` Nathan Chancellor
@ 2023-11-02 1:11 ` Kent Overstreet
0 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2023-11-02 1:11 UTC (permalink / raw)
To: Nathan Chancellor; +Cc: linux-bcachefs, llvm
On Wed, Nov 01, 2023 at 10:07:43AM -0700, Nathan Chancellor wrote:
> On Wed, Nov 01, 2023 at 10:02:55AM -0700, Nathan Chancellor wrote:
> > Hi Kent,
> >
> > On Tue, Oct 24, 2023 at 03:14:11PM -0400, Kent Overstreet wrote:
> > > This adds a new btree, rebalance_work, to eliminate scanning required
> > > for finding extents that need work done on them in the background - i.e.
> > > for the background_target and background_compression options.
> > >
> > > rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
> > > extent in the extents or reflink btree at the same pos.
> > >
> > > A new extent field is added, bch_extent_rebalance, which indicates that
> > > this extent has work that needs to be done in the background - and which
> > > options to use. This allows per-inode options to be propagated to
> > > indirect extents - at least in some circumstances. In this patch,
> > > changing IO options on a file will not propagate the new options to
> > > indirect extents pointed to by that file.
> > >
> > > Updating (setting/clearing) the rebalance_work btree is done by the
> > > extent trigger, which looks at the bch_extent_rebalance field.
> > >
> > > Scanning is still required after changing IO path options - either just
> > > for a given inode, or for the whole filesystem. We indicate that
> > > scanning is required by adding a KEY_TYPE_cookie key to the
> > > rebalance_work btree: the cookie counter is so that we can detect that
> > > scanning is still required when an option has been flipped mid-way
> > > through an existing scan.
> > >
> > > Future possible work:
> > > - Propagate options to indirect extents when being changed
> > > - Add other IO path options - nr_replicas, ec, to rebalance_work so
> > > they can be applied in the background when they change
> > > - Add a counter, for bcachefs fs usage output, showing the pending
> > > amount of rebalance work: we'll probably want to do this after the
> > > disk space accounting rewrite (moving it to a new btree)
> > >
> > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> >
> > <snip>
> >
> > > diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
> > > index 540c78cd4b0c..dbbdf1955f76 100644
> > > --- a/fs/bcachefs/reflink.c
> > > +++ b/fs/bcachefs/reflink.c
> > > @@ -7,6 +7,7 @@
> > > #include "inode.h"
> > > #include "io_misc.h"
> > > #include "io_write.h"
> > > +#include "rebalance.h"
> > > #include "reflink.h"
> > > #include "subvolume.h"
> > > #include "super-io.h"
> > > @@ -252,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
> > > struct bpos dst_start = POS(dst_inum.inum, dst_offset);
> > > struct bpos src_start = POS(src_inum.inum, src_offset);
> > > struct bpos dst_end = dst_start, src_end = src_start;
> > > + struct bch_io_opts opts;
> > > struct bpos src_want;
> > > u64 dst_done;
> > > u32 dst_snapshot, src_snapshot;
> > > @@ -269,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
> > > bch2_bkey_buf_init(&new_src);
> > > trans = bch2_trans_get(c);
> > >
> > > + ret = bch2_inum_opts_get(trans, src_inum, &opts);
> > > + if (ret)
> > > + goto err;
> > > +
> >
> > Not sure if this has been reported or fixed yet but this appears to
> > introduce a valid clang warning:
> >
> > fs/bcachefs/reflink.c:275:6: error: variable 'dst_done' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
> > 275 | if (ret)
> > | ^~~
> > fs/bcachefs/reflink.c:405:9: note: uninitialized use occurs here
> > 405 | return dst_done ?: ret ?: ret2;
> > | ^~~~~~~~
> > fs/bcachefs/reflink.c:275:2: note: remove the 'if' if its condition is always false
> > 275 | if (ret)
> > | ^~~~~~~~
> > 276 | goto err;
> > | ~~~~~~~~
> > fs/bcachefs/reflink.c:258:14: note: initialize the variable 'dst_done' to silence this warning
> > 258 | u64 dst_done;
> > | ^
> > | = 0
> > 1 error generated.
> >
> > I tried to reason my way through a patch but I am a little lost, hence
> > just the report :)
>
> Actually, it just seems like dst_done should be explicitly zero
> initialized, so that ret is used as the return value. Don't know why I
> was as confused as I was :) would you like a formal patch or to just
> squash it in?
Squashed it in, thanks :)
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2023-11-02 1:11 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-24 19:14 [PATCH 0/6] rebalance_work btree Kent Overstreet
2023-10-24 19:14 ` [PATCH 1/6] bcachefs: move.c exports, refactoring Kent Overstreet
2023-10-24 19:14 ` [PATCH 2/6] bcachefs: moving_context now owns a btree_trans Kent Overstreet
2023-10-24 19:14 ` [PATCH 3/6] bcachefs: move: convert to bbpos Kent Overstreet
2023-10-24 19:14 ` [PATCH 4/6] bcachefs: move: move_stats refactoring Kent Overstreet
2023-10-24 19:14 ` [PATCH 5/6] bcachefs: bch2_inum_opts_get() Kent Overstreet
2023-10-24 19:14 ` [PATCH 6/6] bcachefs: rebalance_work Kent Overstreet
2023-11-01 17:02 ` Nathan Chancellor
2023-11-01 17:07 ` Nathan Chancellor
2023-11-02 1:11 ` Kent Overstreet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox