public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 6.6.y 0/2] backport to fix error propagation of split bios
@ 2026-04-17  2:51 Ruohan Lan
  2026-04-17  2:51 ` [PATCH 6.6.y 1/2] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io() Ruohan Lan
  2026-04-17  2:51 ` [PATCH 6.6.y 2/2] btrfs: fix error propagation of split bios Ruohan Lan
  0 siblings, 2 replies; 3+ messages in thread
From: Ruohan Lan @ 2026-04-17  2:51 UTC (permalink / raw)
  To: gregkh, sashal, stable
  Cc: linux-btrfs, naohiro.aota, wqu, dsterba, ruohanlan, hch,
	johannes.thumshirn

Backport commit d48e1dea3931 ("btrfs: fix error propagation of split bios") 
to 6.6.y to fix error propagation of split bios.
It depends on commit 
9ca0e58cb752 ("btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io()").

In order to make a clean backport on stable kernel, backport 2 commits.

Naohiro Aota (1):
  btrfs: fix error propagation of split bios

Qu Wenruo (1):
  btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io()

 fs/btrfs/bio.c | 62 ++++++++++++++++++--------------------------------
 fs/btrfs/bio.h |  3 +++
 2 files changed, 25 insertions(+), 40 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 6.6.y 1/2] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io()
  2026-04-17  2:51 [PATCH 6.6.y 0/2] backport to fix error propagation of split bios Ruohan Lan
@ 2026-04-17  2:51 ` Ruohan Lan
  2026-04-17  2:51 ` [PATCH 6.6.y 2/2] btrfs: fix error propagation of split bios Ruohan Lan
  1 sibling, 0 replies; 3+ messages in thread
From: Ruohan Lan @ 2026-04-17  2:51 UTC (permalink / raw)
  To: gregkh, sashal, stable
  Cc: linux-btrfs, naohiro.aota, wqu, dsterba, ruohanlan, hch,
	johannes.thumshirn

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit 9ca0e58cb752b09816f56f7a3147a39773d5e831 ]

There are only two differences between the two functions:

- btrfs_orig_bbio_end_io() does extra error propagation
  This is mostly to allow tolerance for write errors.

- btrfs_orig_bbio_end_io() does extra pending_ios check
  This check can handle both the original bio, or the cloned one.
  (All accounting happens in the original one).

This makes btrfs_orig_bbio_end_io() a much safer call.
In fact we already had a double freeing error due to usage of
btrfs_bio_end_io() in the error path of btrfs_submit_chunk().

So just move the whole content of btrfs_orig_bbio_end_io() into
btrfs_bio_end_io().

For normal paths this brings no change, because they are already calling
btrfs_orig_bbio_end_io() in the first place.

For error paths (not only inside bio.c but also external callers), this
change will introduce extra checks, especially for external callers, as
they will error out without submitting the btrfs bio.

But considering it's already in the error path, such slower but much
safer checks are still an overall win.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Ruohan Lan <ruohanlan@aliyun.com>
---
 fs/btrfs/bio.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 6fa13be15f30..36b5ec9701d2 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -122,12 +122,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
 	}
 }
 
-void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
-	bbio->bio.bi_status = status;
-	__btrfs_bio_end_io(bbio);
-}
-
 static void btrfs_orig_write_end_io(struct bio *bio);
 
 static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -149,8 +143,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
 	}
 }
 
-static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
+	bbio->bio.bi_status = status;
 	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
 		struct btrfs_bio *orig_bbio = bbio->private;
 
@@ -181,7 +176,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
 static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
 {
 	if (atomic_dec_and_test(&fbio->repair_count)) {
-		btrfs_orig_bbio_end_io(fbio->bbio);
+		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
 		mempool_free(fbio, &btrfs_failed_bio_pool);
 	}
 }
@@ -322,7 +317,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
 	if (fbio)
 		btrfs_repair_done(fbio);
 	else
-		btrfs_orig_bbio_end_io(bbio);
+		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 }
 
 static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
@@ -356,7 +351,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	if (is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
 	else
-		btrfs_orig_bbio_end_io(bbio);
+		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
@@ -376,7 +371,7 @@ static void btrfs_simple_end_io(struct bio *bio)
 	} else {
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
 			btrfs_record_physical_zoned(bbio);
-		btrfs_orig_bbio_end_io(bbio);
+		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 	}
 }
 
@@ -390,7 +385,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
 	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, NULL);
 	else
-		btrfs_orig_bbio_end_io(bbio);
+		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 
 	btrfs_put_bioc(bioc);
 }
@@ -420,7 +415,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
 	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
 		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 
-	btrfs_orig_bbio_end_io(bbio);
+	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 	btrfs_put_bioc(bioc);
 }
 
@@ -586,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/* If an error occurred we just want to clean up the bio and move on. */
 	if (bio->bi_status) {
-		btrfs_orig_bbio_end_io(async->bbio);
+		btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
 		return;
 	}
 
@@ -750,11 +745,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
 		ASSERT(remaining);
 
-		remaining->bio.bi_status = ret;
-		btrfs_orig_bbio_end_io(remaining);
+		btrfs_bio_end_io(remaining, ret);
 	}
-	bbio->bio.bi_status = ret;
-	btrfs_orig_bbio_end_io(bbio);
+	btrfs_bio_end_io(bbio, ret);
 	/* Do not submit another chunk */
 	return true;
 }
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 6.6.y 2/2] btrfs: fix error propagation of split bios
  2026-04-17  2:51 [PATCH 6.6.y 0/2] backport to fix error propagation of split bios Ruohan Lan
  2026-04-17  2:51 ` [PATCH 6.6.y 1/2] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io() Ruohan Lan
@ 2026-04-17  2:51 ` Ruohan Lan
  1 sibling, 0 replies; 3+ messages in thread
From: Ruohan Lan @ 2026-04-17  2:51 UTC (permalink / raw)
  To: gregkh, sashal, stable
  Cc: linux-btrfs, naohiro.aota, wqu, dsterba, ruohanlan, hch,
	johannes.thumshirn

From: Naohiro Aota <naohiro.aota@wdc.com>

[ Upstream commit d48e1dea3931de64c26717adc2b89743c7ab6594 ]

The purpose of btrfs_bbio_propagate_error() shall be propagating an error
of split bio to its original btrfs_bio, and tell the error to the upper
layer. However, it's not working well on some cases.

* Case 1. Immediate (or quick) end_bio with an error

When btrfs sends btrfs_bio to mirrored devices, btrfs calls
btrfs_bio_end_io() when all the mirroring bios are completed. If that
btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
accesses the orig_bbio's bio context to increase the error count.

That works well in most cases. However, if the end_io is called enough
fast, orig_bbio's (remaining part after split) bio context may not be
properly set at that time. Since the bio context is set when the orig_bbio
(the last btrfs_bio) is sent to devices, that might be too late for earlier
split btrfs_bio's completion.  That will result in NULL pointer
dereference.

That bug is easily reproducible by running btrfs/146 on zoned devices [1]
and it shows the following trace.

[1] You need raid-stripe-tree feature as it create "-d raid0 -m raid1" FS.

  BUG: kernel NULL pointer dereference, address: 0000000000000020
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: Oops: 0000 [#1] PREEMPT SMP PTI
  CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  Workqueue: writeback wb_workfn (flush-btrfs-5)
  RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
  BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
  RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
  RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
  RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
  R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
  R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
  FS:  0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <TASK>
   ? __die_body.cold+0x19/0x26
   ? page_fault_oops+0x13e/0x2b0
   ? _printk+0x58/0x73
   ? do_user_addr_fault+0x5f/0x750
   ? exc_page_fault+0x76/0x240
   ? asm_exc_page_fault+0x22/0x30
   ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
   ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
   btrfs_orig_write_end_io+0x51/0x90 [btrfs]
   dm_submit_bio+0x5c2/0xa50 [dm_mod]
   ? find_held_lock+0x2b/0x80
   ? blk_try_enter_queue+0x90/0x1e0
   __submit_bio+0xe0/0x130
   ? ktime_get+0x10a/0x160
   ? lockdep_hardirqs_on+0x74/0x100
   submit_bio_noacct_nocheck+0x199/0x410
   btrfs_submit_bio+0x7d/0x150 [btrfs]
   btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
   ? lockdep_hardirqs_on+0x74/0x100
   ? __folio_start_writeback+0x10/0x2c0
   btrfs_submit_bbio+0x1c/0x40 [btrfs]
   submit_one_bio+0x44/0x60 [btrfs]
   submit_extent_folio+0x13f/0x330 [btrfs]
   ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
   extent_writepage_io+0x18b/0x360 [btrfs]
   extent_write_locked_range+0x17c/0x340 [btrfs]
   ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
   run_delalloc_cow+0x71/0xd0 [btrfs]
   btrfs_run_delalloc_range+0x176/0x500 [btrfs]
   ? find_lock_delalloc_range+0x119/0x260 [btrfs]
   writepage_delalloc+0x2ab/0x480 [btrfs]
   extent_write_cache_pages+0x236/0x7d0 [btrfs]
   btrfs_writepages+0x72/0x130 [btrfs]
   do_writepages+0xd4/0x240
   ? find_held_lock+0x2b/0x80
   ? wbc_attach_and_unlock_inode+0x12c/0x290
   ? wbc_attach_and_unlock_inode+0x12c/0x290
   __writeback_single_inode+0x5c/0x4c0
   ? do_raw_spin_unlock+0x49/0xb0
   writeback_sb_inodes+0x22c/0x560
   __writeback_inodes_wb+0x4c/0xe0
   wb_writeback+0x1d6/0x3f0
   wb_workfn+0x334/0x520
   process_one_work+0x1ee/0x570
   ? lock_is_held_type+0xc6/0x130
   worker_thread+0x1d1/0x3b0
   ? __pfx_worker_thread+0x10/0x10
   kthread+0xee/0x120
   ? __pfx_kthread+0x10/0x10
   ret_from_fork+0x30/0x50
   ? __pfx_kthread+0x10/0x10
   ret_from_fork_asm+0x1a/0x30
   </TASK>
  Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
  CR2: 0000000000000020

* Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios

btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
called last among split bios. In that case, btrfs_orig_write_end_io() sets
the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [2].
Otherwise, the increased orig_bio's bioc->error is not checked by anyone
and return BLK_STS_OK to the upper layer.

[2] Actually, this is not true. Because we only increases orig_bioc->errors
by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
is still not met if only one split btrfs_bio fails.

* Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios

In contrast to the above case, btrfs_bbio_propagate_error() is not working
well if un-mirrored orig_bbio is completed last. It sets
orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
the upper layer would not know the failure.

* Solution

Considering the above cases, we can only save the error status in the
orig_bbio (remaining part after split) itself as it is always
available. Also, the saved error status should be propagated when all the
split btrfs_bios are finished (i.e, bbio->pending_ios == 0).

This commit introduces "status" to btrfs_bbio and saves the first error of
split bios to original btrfs_bio's "status" variable. When all the split
bios are finished, the saved status is loaded into original btrfs_bio's
status.

With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
dereference anymore.

Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
CC: stable@vger.kernel.org # 6.6+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Ruohan Lan <ruohanlan@aliyun.com>
---
 fs/btrfs/bio.c | 37 +++++++++++++------------------------
 fs/btrfs/bio.h |  3 +++
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 36b5ec9701d2..c074b3623e2c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -51,6 +51,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
+	WRITE_ONCE(bbio->status, BLK_STS_OK);
 }
 
 /*
@@ -122,41 +123,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
 	}
 }
 
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
-				       struct btrfs_bio *orig_bbio)
-{
-	/*
-	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
-	 * just blindly propagate a write failure here.  Instead increment the
-	 * error count in the original I/O context so that it is guaranteed to
-	 * be larger than the error tolerance.
-	 */
-	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
-		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
-		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
-		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
-	} else {
-		orig_bbio->bio.bi_status = bbio->bio.bi_status;
-	}
-}
-
 void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
 	bbio->bio.bi_status = status;
 	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
 		struct btrfs_bio *orig_bbio = bbio->private;
 
-		if (bbio->bio.bi_status)
-			btrfs_bbio_propagate_error(bbio, orig_bbio);
 		btrfs_cleanup_bio(bbio);
 		bbio = orig_bbio;
 	}
 
-	if (atomic_dec_and_test(&bbio->pending_ios))
+	/*
+	 * At this point, bbio always points to the original btrfs_bio. Save
+	 * the first error in it.
+	 */
+	if (status != BLK_STS_OK)
+		cmpxchg(&bbio->status, BLK_STS_OK, status);
+
+	if (atomic_dec_and_test(&bbio->pending_ios)) {
+		/* Load split bio's error which might be set above. */
+		if (status == BLK_STS_OK)
+			bbio->bio.bi_status = READ_ONCE(bbio->status);
 		__btrfs_bio_end_io(bbio);
+	}
 }
 
 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..264dda60a1cb 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -77,6 +77,9 @@ struct btrfs_bio {
 	/* File system that this I/O operates on. */
 	struct btrfs_fs_info *fs_info;
 
+	/* Save the first error status of split bio. */
+	blk_status_t status;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-04-17  2:51 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-17  2:51 [PATCH 6.6.y 0/2] backport to fix error propagation of split bios Ruohan Lan
2026-04-17  2:51 ` [PATCH 6.6.y 1/2] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io() Ruohan Lan
2026-04-17  2:51 ` [PATCH 6.6.y 2/2] btrfs: fix error propagation of split bios Ruohan Lan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox