public inbox for linux-bcachefs@vger.kernel.org
 help / color / mirror / Atom feed
From: Brian Foster <bfoster@redhat.com>
To: Kent Overstreet <kent.overstreet@linux.dev>
Cc: linux-bcachefs@vger.kernel.org, linux-kernel@vger.kernel.org,
	djwong@kernel.org
Subject: Re: [PATCH 03/21] bcachefs: btree write buffer knows how to accumulate bch_accounting keys
Date: Tue, 27 Feb 2024 10:50:23 -0500	[thread overview]
Message-ID: <Zd4Ev8XPzn5AJ8K9@bfoster> (raw)
In-Reply-To: <20240225023826.2413565-4-kent.overstreet@linux.dev>

On Sat, Feb 24, 2024 at 09:38:05PM -0500, Kent Overstreet wrote:
> Teach the btree write buffer how to accumulate accounting keys - instead
> of having the newer key overwrite the older key as we do with other
> updates, we need to add them together.
> 
> Also, add a flag so that write buffer flush knows when journal replay is
> finished flushing accounting, and teach it to hold accounting keys until
> that flag is set.
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  fs/bcachefs/bcachefs.h           |  1 +
>  fs/bcachefs/btree_write_buffer.c | 66 +++++++++++++++++++++++++++-----
>  fs/bcachefs/recovery.c           |  3 ++
>  3 files changed, 61 insertions(+), 9 deletions(-)
> 
...
> diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
> index b77e7b382b66..002a0762fc85 100644
> --- a/fs/bcachefs/btree_write_buffer.c
> +++ b/fs/bcachefs/btree_write_buffer.c
> @@ -5,6 +5,7 @@
>  #include "btree_update.h"
>  #include "btree_update_interior.h"
>  #include "btree_write_buffer.h"
> +#include "disk_accounting.h"
>  #include "error.h"
>  #include "journal.h"
>  #include "journal_io.h"
> @@ -123,7 +124,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
>  
>  static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
>  			       struct btree_write_buffered_key *wb,
> -			       bool *write_locked, size_t *fast)
> +			       bool *write_locked,
> +			       bool *accounting_accumulated,
> +			       size_t *fast)
>  {
>  	struct btree_path *path;
>  	int ret;
> @@ -136,6 +139,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
>  	if (ret)
>  		return ret;
>  
> +	if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
> +		struct bkey u;
> +		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
> +
> +		if (k.k->type == KEY_TYPE_accounting)
> +			bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
> +						   bkey_s_c_to_accounting(k));

So it looks like we're accumulating from the btree key into the write
buffer key. Is this so the following code will basically insert a new
btree key based on the value of the write buffer key?

> +	}
> +	*accounting_accumulated = true;
> +
>  	/*
>  	 * We can't clone a path that has write locks: unshare it now, before
>  	 * set_pos and traverse():
> @@ -248,8 +261,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>  	struct journal *j = &c->journal;
>  	struct btree_write_buffer *wb = &c->btree_write_buffer;
>  	struct btree_iter iter = { NULL };
> -	size_t skipped = 0, fast = 0, slowpath = 0;
> +	size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
>  	bool write_locked = false;
> +	bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
>  	int ret = 0;
>  
>  	bch2_trans_unlock(trans);
> @@ -284,17 +298,29 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>  
>  	darray_for_each(wb->sorted, i) {
>  		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
> +		bool accounting_accumulated = false;

Should this live within the interior flush loop?

>  
>  		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
>  			prefetch(&wb->flushing.keys.data[n->idx]);
>  
>  		BUG_ON(!k->journal_seq);
>  
> +		if (!accounting_replay_done &&
> +		    k->k.k.type == KEY_TYPE_accounting) {
> +			slowpath++;
> +			continue;
> +		}
> +
>  		if (i + 1 < &darray_top(wb->sorted) &&
>  		    wb_key_eq(i, i + 1)) {
>  			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
>  
> -			skipped++;
> +			if (k->k.k.type == KEY_TYPE_accounting &&
> +			    n->k.k.type == KEY_TYPE_accounting)
> +				bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
> +							   bkey_i_to_s_c_accounting(&k->k));
> +
> +			overwritten++;
>  			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
>  			k->journal_seq = 0;
>  			continue;
> @@ -325,7 +351,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>  				break;
>  			}
>  
> -			ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
> +			ret = wb_flush_one(trans, &iter, k, &write_locked,
> +					   &accounting_accumulated, &fast);
>  			if (!write_locked)
>  				bch2_trans_begin(trans);
>  		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
> @@ -361,8 +388,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>  			if (!i->journal_seq)
>  				continue;
>  
> -			bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> -						bch2_btree_write_buffer_journal_flush);
> +			if (!accounting_replay_done &&
> +			    i->k.k.type == KEY_TYPE_accounting) {
> +				could_not_insert++;
> +				continue;
> +			}
> +
> +			if (!could_not_insert)
> +				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
> +							bch2_btree_write_buffer_journal_flush);

Hmm.. so this is sane because the slowpath runs in journal sorted order,
right?

>  
>  			bch2_trans_begin(trans);
>  
> @@ -375,13 +409,27 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
>  					btree_write_buffered_insert(trans, i));
>  			if (ret)
>  				goto err;
> +
> +			i->journal_seq = 0;
> +		}
> +

		/*
		 * Condense the remaining keys <reasons reasons>...??
		 */

> +		if (could_not_insert) {
> +			struct btree_write_buffered_key *dst = wb->flushing.keys.data;
> +
> +			darray_for_each(wb->flushing.keys, i)
> +				if (i->journal_seq)
> +					*dst++ = *i;
> +			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
>  		}
>  	}
>  err:
> +	if (ret || !could_not_insert) {
> +		bch2_journal_pin_drop(j, &wb->flushing.pin);
> +		wb->flushing.keys.nr = 0;
> +	}
> +
>  	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
> -	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
> -	bch2_journal_pin_drop(j, &wb->flushing.pin);
> -	wb->flushing.keys.nr = 0;
> +	trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);

I feel like the last time I looked at the write buffer stuff the flush
wasn't reentrant in this way. I.e., the flush switched out the active
buffer and so had to process all entries in the current buffer (or
something like that). Has something changed or do I misunderstand?

>  	return ret;
>  }
>  
> diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> index 6829d80bd181..b8289af66c8e 100644
> --- a/fs/bcachefs/recovery.c
> +++ b/fs/bcachefs/recovery.c
> @@ -228,6 +228,8 @@ static int bch2_journal_replay(struct bch_fs *c)
>  			goto err;
>  	}
>  
> +	set_bit(BCH_FS_accounting_replay_done, &c->flags);
> +

I assume this ties into the question on the previous patch..

Related question.. if the write buffer can't flush during journal
replay, is there concern/risk of overflowing it?

Brian

>  	/*
>  	 * First, attempt to replay keys in sorted order. This is more
>  	 * efficient - better locality of btree access -  but some might fail if
> @@ -1204,6 +1206,7 @@ int bch2_fs_initialize(struct bch_fs *c)
>  	 * set up the journal.pin FIFO and journal.cur pointer:
>  	 */
>  	bch2_fs_journal_start(&c->journal, 1);
> +	set_bit(BCH_FS_accounting_replay_done, &c->flags);
>  	bch2_journal_set_replay_done(&c->journal);
>  
>  	ret = bch2_fs_read_write_early(c);
> -- 
> 2.43.0
> 


  reply	other threads:[~2024-02-27 15:48 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-25  2:38 [PATCH 00/21] bcachefs disk accounting rewrite Kent Overstreet
2024-02-25  2:38 ` [PATCH 01/21] bcachefs: KEY_TYPE_accounting Kent Overstreet
2024-02-27 15:49   ` Brian Foster
2024-02-28 19:39     ` Kent Overstreet
2024-02-29 18:43       ` Brian Foster
2024-02-29 21:24         ` Kent Overstreet
2024-03-01 15:03           ` Brian Foster
2024-03-01 19:30             ` Kent Overstreet
2024-02-25  2:38 ` [PATCH 02/21] bcachefs: Accumulate accounting keys in journal replay Kent Overstreet
2024-02-27 15:49   ` Brian Foster
2024-02-28 20:06     ` Kent Overstreet
2024-02-25  2:38 ` [PATCH 03/21] bcachefs: btree write buffer knows how to accumulate bch_accounting keys Kent Overstreet
2024-02-27 15:50   ` Brian Foster [this message]
2024-02-28 22:42     ` Kent Overstreet
2024-02-29 18:44       ` Brian Foster
2024-02-29 20:25         ` Kent Overstreet
2024-02-25  2:38 ` [PATCH 04/21] bcachefs: Disk space accounting rewrite Kent Overstreet
2024-02-27 15:55   ` Brian Foster
2024-02-29  4:10     ` Kent Overstreet
2024-02-29 18:44       ` Brian Foster
2024-02-29 21:16         ` Kent Overstreet
2024-03-01 15:03           ` Brian Foster
2024-02-25  2:38 ` [PATCH 05/21] bcachefs: dev_usage updated by new accounting Kent Overstreet
2024-02-25  2:38 ` [PATCH 06/21] bcachefs: Kill bch2_fs_usage_initialize() Kent Overstreet
2024-02-25  2:38 ` [PATCH 07/21] bcachefs: Convert bch2_ioctl_fs_usage() to new accounting Kent Overstreet
2024-02-25  2:38 ` [PATCH 08/21] bcachefs: kill bch2_fs_usage_read() Kent Overstreet
2024-02-25  2:38 ` [PATCH 09/21] bcachefs: Kill writing old accounting to journal Kent Overstreet
2024-02-25  2:38 ` [PATCH 10/21] bcachefs: Delete journal-buf-sharded old style accounting Kent Overstreet
2024-02-25  2:38 ` [PATCH 11/21] bcachefs: Kill bch2_fs_usage_to_text() Kent Overstreet
2024-02-25  2:38 ` [PATCH 12/21] bcachefs: Kill fs_usage_online Kent Overstreet
2024-02-25  2:38 ` [PATCH 13/21] bcachefs: Kill replicas_journal_res Kent Overstreet
2024-02-25  2:38 ` [PATCH 14/21] bcachefs: Convert gc to new accounting Kent Overstreet
2024-02-25  2:38 ` [PATCH 15/21] bcachefs: Convert bch2_replicas_gc2() " Kent Overstreet
2024-02-25  2:38 ` [PATCH 16/21] bcachefs: bch2_verify_accounting_clean() Kent Overstreet
2024-02-25  2:38 ` [PATCH 17/21] bcachefs: Eytzinger accumulation for accounting keys Kent Overstreet
2024-02-25  2:38 ` [PATCH 18/21] bcachefs: bch_acct_compression Kent Overstreet
2024-02-25  2:38 ` [PATCH 19/21] bcachefs: Convert bch2_compression_stats_to_text() to new accounting Kent Overstreet
2024-02-25  2:38 ` [PATCH 20/21] bcachefs: bch2_fs_accounting_to_text() Kent Overstreet
2024-02-25  2:38 ` [PATCH 21/21] bcachefs: bch2_fs_usage_base_to_text() Kent Overstreet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zd4Ev8XPzn5AJ8K9@bfoster \
    --to=bfoster@redhat.com \
    --cc=djwong@kernel.org \
    --cc=kent.overstreet@linux.dev \
    --cc=linux-bcachefs@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox