linux-bcachefs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] bcachefs: bio per journal buf
@ 2024-01-31 20:16 Kent Overstreet
  2024-01-31 20:16 ` [PATCH 2/3] bcachefs: closure " Kent Overstreet
  2024-01-31 20:16 ` [PATCH 3/3] bcachefs: better journal pipelining Kent Overstreet
  0 siblings, 2 replies; 3+ messages in thread
From: Kent Overstreet @ 2024-01-31 20:16 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Prep work for having multiple journal writes in flight.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal.c       | 40 ++++++++++++++++++++-----------------
 fs/bcachefs/journal_io.c    | 21 +++++++++----------
 fs/bcachefs/journal_types.h |  2 +-
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 13c9b6b5a7da..306d46654df3 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1234,13 +1234,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 
 void bch2_dev_journal_exit(struct bch_dev *ca)
 {
-	kfree(ca->journal.bio);
-	kfree(ca->journal.buckets);
-	kfree(ca->journal.bucket_seq);
+	struct journal_device *ja = &ca->journal;
+
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		kfree(ja->bio[i]);
+		ja->bio[i] = NULL;
+	}
 
-	ca->journal.bio		= NULL;
-	ca->journal.buckets	= NULL;
-	ca->journal.bucket_seq	= NULL;
+	kfree(ja->buckets);
+	kfree(ja->bucket_seq);
+	ja->buckets	= NULL;
+	ja->bucket_seq	= NULL;
 }
 
 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1250,14 +1254,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 		bch2_sb_field_get(sb, journal);
 	struct bch_sb_field_journal_v2 *journal_buckets_v2 =
 		bch2_sb_field_get(sb, journal_v2);
-	unsigned i, nr_bvecs;
 
 	ja->nr = 0;
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
 
-		for (i = 0; i < nr; i++)
+		for (unsigned i = 0; i < nr; i++)
 			ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
 	} else if (journal_buckets) {
 		ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1267,13 +1270,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 	if (!ja->bucket_seq)
 		return -BCH_ERR_ENOMEM_dev_journal_init;
 
-	nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
-	ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-	if (!ca->journal.bio)
-		return -BCH_ERR_ENOMEM_dev_journal_init;
-
-	bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+		if (!ja->bio[i])
+			return -BCH_ERR_ENOMEM_dev_journal_init;
+		bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0);
+	}
 
 	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
 	if (!ja->buckets)
@@ -1281,14 +1285,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-		unsigned j, dst = 0;
+		unsigned dst = 0;
 
-		for (i = 0; i < nr; i++)
-			for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+		for (unsigned i = 0; i < nr; i++)
+			for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
 				ja->buckets[dst++] =
 					le64_to_cpu(journal_buckets_v2->d[i].start) + j;
 	} else if (journal_buckets) {
-		for (i = 0; i < ja->nr; i++)
+		for (unsigned i = 0; i < ja->nr; i++)
 			ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
 	}
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 5eb80686f0e0..92efacad80e6 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1718,13 +1718,14 @@ static CLOSURE_CALLBACK(do_journal_write)
 {
 	closure_type(j, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bch_dev *ca;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	struct bio *bio;
+	unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK;
+	struct journal_buf *w = j->buf + buf_idx;
 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 
 	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-		ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct journal_device *ja = &ca->journal;
+
 		if (!percpu_ref_tryget(&ca->io_ref)) {
 			/* XXX: fix this */
 			bch_err(c, "missing device for journal write\n");
@@ -1734,7 +1735,7 @@ static CLOSURE_CALLBACK(do_journal_write)
 		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
 			     sectors);
 
-		bio = ca->journal.bio;
+		struct bio *bio = ja->bio[buf_idx];
 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
@@ -1753,8 +1754,7 @@ static CLOSURE_CALLBACK(do_journal_write)
 		trace_and_count(c, journal_write, bio);
 		closure_bio_submit(bio, cl);
 
-		ca->journal.bucket_seq[ca->journal.cur_idx] =
-			le64_to_cpu(w->data->seq);
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
 	}
 
 	continue_at(cl, journal_write_done, j->wq);
@@ -1936,9 +1936,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
 {
 	closure_type(j, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
+	unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK;
+	struct journal_buf *w = j->buf + buf_idx;
 	struct bch_replicas_padded replicas;
-	struct bio *bio;
 	struct printbuf journal_debug_buf = PRINTBUF;
 	unsigned nr_rw_members = 0;
 	int ret;
@@ -2020,7 +2020,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
 		for_each_rw_member(c, ca) {
 			percpu_ref_get(&ca->io_ref);
 
-			bio = ca->journal.bio;
+			struct journal_device *ja = &ca->journal;
+			struct bio *bio = ja->bio[buf_idx];
 			bio_reset(bio, ca->disk_sb.bdev,
 				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
 			bio->bi_end_io		= journal_write_endio;
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 957be8949fb0..a4b07e5b5710 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -310,7 +310,7 @@ struct journal_device {
 	u64			*buckets;
 
 	/* Bio for journal reads/writes to this device */
-	struct bio		*bio;
+	struct bio		*bio[JOURNAL_BUF_NR];
 
 	/* for bch_journal_read_device */
 	struct closure		read;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] bcachefs: closure per journal buf
  2024-01-31 20:16 [PATCH 1/3] bcachefs: bio per journal buf Kent Overstreet
@ 2024-01-31 20:16 ` Kent Overstreet
  2024-01-31 20:16 ` [PATCH 3/3] bcachefs: better journal pipelining Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2024-01-31 20:16 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Prep work for having multiple journal writes in flight.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal.c       | 18 ++++++++++++------
 fs/bcachefs/journal_io.c    | 34 +++++++++++++++++++---------------
 fs/bcachefs/journal_types.h | 12 ++++++++++--
 3 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 306d46654df3..3508871cfd8a 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -186,8 +186,10 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
 
 	if (__bch2_journal_pin_put(j, seq))
 		bch2_journal_reclaim_fast(j);
-	if (write)
-		closure_call(&j->io, bch2_journal_write, j->wq, NULL);
+	if (write) {
+		struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK);
+		closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+	}
 }
 
 /*
@@ -1273,10 +1275,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
 	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
-		ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+		ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+				     nr_bvecs), GFP_KERNEL);
 		if (!ja->bio[i])
 			return -BCH_ERR_ENOMEM_dev_journal_init;
-		bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0);
+
+		ja->bio[i]->ca = ca;
+		ja->bio[i]->buf_idx = i;
+		bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
 	}
 
 	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
@@ -1339,6 +1345,7 @@ int bch2_fs_journal_init(struct journal *j)
 		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
 		if (!j->buf[i].data)
 			return -BCH_ERR_ENOMEM_journal_buf;
+		j->buf[i].idx = i;
 	}
 
 	j->pin.front = j->pin.back = 1;
@@ -1458,7 +1465,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *pin;
-	unsigned i;
 
 	spin_lock(&j->lock);
 	*seq = max(*seq, j->pin.front);
@@ -1476,7 +1482,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
 		list_for_each_entry(pin, &pin_list->list[i], list) {
 			prt_printf(out, "\t%px %ps", pin, pin->flush);
 			prt_newline(out);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 92efacad80e6..8dbf3b3fac32 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1595,9 +1595,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 
 static CLOSURE_CALLBACK(journal_write_done)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
 	union journal_res_state old, new;
 	u64 v, seq;
@@ -1673,8 +1673,9 @@ static CLOSURE_CALLBACK(journal_write_done)
 
 	if (!journal_state_count(new, new.unwritten_idx) &&
 	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+		struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
 		spin_unlock(&j->lock);
-		closure_call(&j->io, bch2_journal_write, j->wq, NULL);
+		closure_call(&w->io, bch2_journal_write, j->wq, NULL);
 	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
 		struct journal_buf *buf = journal_cur_buf(j);
@@ -1695,31 +1696,32 @@ static CLOSURE_CALLBACK(journal_write_done)
 
 static void journal_write_endio(struct bio *bio)
 {
-	struct bch_dev *ca = bio->bi_private;
+	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+	struct bch_dev *ca = jbio->ca;
 	struct journal *j = &ca->fs->journal;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	unsigned long flags;
+	struct journal_buf *w = j->buf + jbio->buf_idx;
 
 	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
 			       "error writing journal entry %llu: %s",
 			       le64_to_cpu(w->data->seq),
 			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("journal")) {
+		unsigned long flags;
+
 		spin_lock_irqsave(&j->err_lock, flags);
 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
 		spin_unlock_irqrestore(&j->err_lock, flags);
 	}
 
-	closure_put(&j->io);
+	closure_put(&w->io);
 	percpu_ref_put(&ca->io_ref);
 }
 
 static CLOSURE_CALLBACK(do_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK;
-	struct journal_buf *w = j->buf + buf_idx;
 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 
 	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
@@ -1735,7 +1737,7 @@ static CLOSURE_CALLBACK(do_journal_write)
 		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
 			     sectors);
 
-		struct bio *bio = ja->bio[buf_idx];
+		struct bio *bio = &ja->bio[w->idx]->bio;
 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
@@ -1934,10 +1936,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 CLOSURE_CALLBACK(bch2_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK;
-	struct journal_buf *w = j->buf + buf_idx;
 	struct bch_replicas_padded replicas;
 	struct printbuf journal_debug_buf = PRINTBUF;
 	unsigned nr_rw_members = 0;
@@ -2016,12 +2017,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	if (ret)
 		goto err;
 
+	if (!JSET_NO_FLUSH(w->data))
+		closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
 	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
 		for_each_rw_member(c, ca) {
 			percpu_ref_get(&ca->io_ref);
 
 			struct journal_device *ja = &ca->journal;
-			struct bio *bio = ja->bio[buf_idx];
+			struct bio *bio = &ja->bio[w->idx]->bio;
 			bio_reset(bio, ca->disk_sb.bdev,
 				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
 			bio->bi_end_io		= journal_write_endio;
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index a4b07e5b5710..4f7769aede33 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -18,6 +18,7 @@
  * the journal that are being staged or in flight.
  */
 struct journal_buf {
+	struct closure		io;
 	struct jset		*data;
 
 	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -37,6 +38,7 @@ struct journal_buf {
 	bool			must_flush;	/* something wants a flush */
 	bool			separate_flush;
 	bool			need_flush_to_write_buffer;
+	u8			idx;
 };
 
 /*
@@ -150,6 +152,13 @@ enum journal_errors {
 
 typedef DARRAY(u64)		darray_u64;
 
+struct journal_bio {
+	struct bch_dev		*ca;
+	unsigned		buf_idx;
+
+	struct bio		bio;
+};
+
 /* Embedded in struct bch_fs */
 struct journal {
 	/* Fastpath stuff up front: */
@@ -204,7 +213,6 @@ struct journal {
 	wait_queue_head_t	wait;
 	struct closure_waitlist	async_wait;
 
-	struct closure		io;
 	struct delayed_work	write_work;
 	struct workqueue_struct *wq;
 
@@ -310,7 +318,7 @@ struct journal_device {
 	u64			*buckets;
 
 	/* Bio for journal reads/writes to this device */
-	struct bio		*bio[JOURNAL_BUF_NR];
+	struct journal_bio	*bio[JOURNAL_BUF_NR];
 
 	/* for bch_journal_read_device */
 	struct closure		read;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] bcachefs: better journal pipelining
  2024-01-31 20:16 [PATCH 1/3] bcachefs: bio per journal buf Kent Overstreet
  2024-01-31 20:16 ` [PATCH 2/3] bcachefs: closure " Kent Overstreet
@ 2024-01-31 20:16 ` Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2024-01-31 20:16 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Recently a severe performance regression was discovered, which bisected
to

  a6548c8b5eb5 bcachefs: Avoid flushing the journal in the discard path

It turns out the old behaviour, which issued excessive journal flushes,
worked around a performance issue where queueing delays would cause the
journal to not be able to write quickly enough and stall.

The journal flushes masked the issue because they periodically flushed
the device write cache, reducing write latency for non flushes.

This patch reworks the journalling code to allow more than one
(non-flush) write to be in flight at a time. With this patch, doing 4k
random writes and an iodepth of 128, we are now able to hit 560k iops to
a Samsung 970 EVO Plus - previously, we were stuck in the ~200k range.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal.c       | 47 +++++++++++++++----
 fs/bcachefs/journal.h       |  7 +--
 fs/bcachefs/journal_io.c    | 94 ++++++++++++++++++++-----------------
 fs/bcachefs/journal_types.h | 11 +++--
 4 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 3508871cfd8a..c040f69dfb5c 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -74,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
 	prt_printf(out, "%li jiffies", buf->expires - jiffies);
 	prt_newline(out);
 
+	if (buf->write_done)
+		prt_printf(out, "write done\n");
+	else if (buf->write_allocated)
+		prt_printf(out, "write allocated\n");
+	else if (buf->write_started)
+		prt_printf(out, "write started\n");
+
 	printbuf_indent_sub(out, 2);
 }
 
@@ -175,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
 	return stuck;
 }
 
+void bch2_journal_do_writes(struct journal *j)
+{
+	for (u64 seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		unsigned idx = seq & JOURNAL_BUF_MASK;
+		struct journal_buf *w = j->buf + idx;
+
+		if (w->write_started && !w->write_allocated)
+			break;
+		if (w->write_started)
+			continue;
+
+		if (!journal_state_count(j->reservations, idx)) {
+			w->write_started = true;
+			closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+		}
+
+		break;
+	}
+}
+
 /*
  * Final processing when the last reference of a journal buffer has been
  * dropped. Drop the pin list reference acquired at journal entry open and write
  * the buffer, if requested.
  */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
 {
 	lockdep_assert_held(&j->lock);
 
 	if (__bch2_journal_pin_put(j, seq))
 		bch2_journal_reclaim_fast(j);
-	if (write) {
-		struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK);
-		closure_call(&w->io, bch2_journal_write, j->wq, NULL);
-	}
+	bch2_journal_do_writes(j);
 }
 
 /*
@@ -381,11 +407,14 @@ static int journal_entry_open(struct journal *j)
 	BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
 
 	bkey_extent_init(&buf->key);
-	buf->noflush	= false;
-	buf->must_flush	= false;
-	buf->separate_flush = false;
-	buf->flush_time	= 0;
+	buf->noflush		= false;
+	buf->must_flush		= false;
+	buf->separate_flush	= false;
+	buf->flush_time		= 0;
 	buf->need_flush_to_write_buffer = true;
+	buf->write_started	= false;
+	buf->write_allocated	= false;
+	buf->write_done		= false;
 
 	memset(buf->data, 0, sizeof(*buf->data));
 	buf->data->seq	= cpu_to_le64(journal_cur_seq(j));
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb8a..7c7528f839c5 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 }
 
 bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
 
 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
 {
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
 
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx))
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 }
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx)) {
 		spin_lock(&j->lock);
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 		spin_unlock(&j->lock);
 	}
 }
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 8dbf3b3fac32..388e8e15691a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1600,12 +1600,12 @@ static CLOSURE_CALLBACK(journal_write_done)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_replicas_padded replicas;
 	union journal_res_state old, new;
-	u64 v, seq;
+	u64 v, seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
 	time_stats_update(!JSET_NO_FLUSH(w->data)
-			       ? j->flush_write_time
-			       : j->noflush_write_time, j->write_start_time);
+			  ? j->flush_write_time
+			  : j->noflush_write_time, j->write_start_time);
 
 	if (!w->devs_written.nr) {
 		bch_err(c, "unable to write journal to sufficient devices");
@@ -1620,63 +1620,68 @@ static CLOSURE_CALLBACK(journal_write_done)
 	if (err)
 		bch2_fatal_error(c);
 
-	spin_lock(&j->lock);
-	seq = le64_to_cpu(w->data->seq);
+	closure_debug_destroy(cl);
 
+	spin_lock(&j->lock);
 	if (seq >= j->pin.front)
 		journal_seq_pin(j, seq)->devs = w->devs_written;
+	if (err && (!j->err_seq || seq < j->err_seq))
+		j->err_seq	= seq;
+	w->write_done = true;
+
+	bool completed = false;
+
+	for (seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		w = j->buf + (seq & JOURNAL_BUF_MASK);
+		if (!w->write_done)
+			break;
 
-	if (!err) {
-		if (!JSET_NO_FLUSH(w->data)) {
+		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
 
 			bch2_do_discards(c);
 			closure_wake_up(&c->freelist_wait);
-
 			bch2_reset_alloc_cursors(c);
 		}
-	} else if (!j->err_seq || seq < j->err_seq)
-		j->err_seq	= seq;
 
-	j->seq_ondisk		= seq;
+		j->seq_ondisk = seq;
 
-	/*
-	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
-	 * more buckets:
-	 *
-	 * Must come before signaling write completion, for
-	 * bch2_fs_journal_stop():
-	 */
-	if (j->watermark != BCH_WATERMARK_stripe)
-		journal_reclaim_kick(&c->journal);
+		/*
+		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+		 * more buckets:
+		 *
+		 * Must come before signaling write completion, for
+		 * bch2_fs_journal_stop():
+		 */
+		if (j->watermark != BCH_WATERMARK_stripe)
+			journal_reclaim_kick(&c->journal);
 
-	/* also must come before signalling write completion: */
-	closure_debug_destroy(cl);
+		v = atomic64_read(&j->reservations.counter);
+		do {
+			old.v = new.v = v;
+			BUG_ON(journal_state_count(new, new.unwritten_idx));
+			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
 
-	v = atomic64_read(&j->reservations.counter);
-	do {
-		old.v = new.v = v;
-		BUG_ON(journal_state_count(new, new.unwritten_idx));
+			new.unwritten_idx++;
+		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
 
-		new.unwritten_idx++;
-	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
-				       old.v, new.v)) != old.v);
+		completed = true;
+	}
 
-	bch2_journal_reclaim_fast(j);
-	bch2_journal_space_available(j);
+	if (completed) {
+		bch2_journal_reclaim_fast(j);
+		bch2_journal_space_available(j);
 
-	track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
+		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
 
-	closure_wake_up(&w->wait);
-	journal_wake(j);
+		closure_wake_up(&w->wait);
+		journal_wake(j);
+	}
 
-	if (!journal_state_count(new, new.unwritten_idx) &&
-	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
-		struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-		spin_unlock(&j->lock);
-		closure_call(&w->io, bch2_journal_write, j->wq, NULL);
-	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
 		struct journal_buf *buf = journal_cur_buf(j);
 		long delta = buf->expires - jiffies;
@@ -1686,12 +1691,10 @@ static CLOSURE_CALLBACK(journal_write_done)
 		 * previous entries still in flight - the current journal entry
 		 * might want to be written now:
 		 */
-
-		spin_unlock(&j->lock);
 		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
-	} else {
-		spin_unlock(&j->lock);
 	}
+
+	spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1945,6 +1948,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	int ret;
 
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+	BUG_ON(w->write_allocated);
 
 	j->write_start_time = local_clock();
 
@@ -1988,12 +1992,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	 * bch2_journal_space_available():
 	 */
 	w->sectors = 0;
+	w->write_allocated = true;
 
 	/*
 	 * journal entry has been compacted and allocated, recalculate space
 	 * available:
 	 */
 	bch2_journal_space_available(j);
+	bch2_journal_do_writes(j);
 	spin_unlock(&j->lock);
 
 	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 4f7769aede33..011f7a0d4ebd 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -34,10 +34,13 @@ struct journal_buf {
 	unsigned		disk_sectors;	/* maximum size entry could have been, if
 						   buf_size was bigger */
 	unsigned		u64s_reserved;
-	bool			noflush;	/* write has already been kicked off, and was noflush */
-	bool			must_flush;	/* something wants a flush */
-	bool			separate_flush;
-	bool			need_flush_to_write_buffer;
+	bool			noflush:1;	/* write has already been kicked off, and was noflush */
+	bool			must_flush:1;	/* something wants a flush */
+	bool			separate_flush:1;
+	bool			need_flush_to_write_buffer:1;
+	bool			write_started:1;
+	bool			write_allocated:1;
+	bool			write_done:1;
 	u8			idx;
 };
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-01-31 20:16 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-31 20:16 [PATCH 1/3] bcachefs: bio per journal buf Kent Overstreet
2024-01-31 20:16 ` [PATCH 2/3] bcachefs: closure " Kent Overstreet
2024-01-31 20:16 ` [PATCH 3/3] bcachefs: better journal pipelining Kent Overstreet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).