public inbox for linux-raid@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] md/raid5: Fix UAF on IO across the reshape position
@ 2026-03-23 22:58 Benjamin Marzinski
  2026-03-30 15:44 ` Xiao Ni
  0 siblings, 1 reply; 8+ messages in thread
From: Benjamin Marzinski @ 2026-03-23 22:58 UTC (permalink / raw)
  To: Song Liu, Yu Kuai, Li Nan; +Cc: linux-raid, dm-devel, Nigel Croxon, Xiao Ni

If make_stripe_request() returns STRIPE_WAIT_RESHAPE,
raid5_make_request() will free the cloned bio. But raid5_make_request()
can call make_stripe_request() multiple times, writing to the various
stripes. If that bio got added to the toread or towrite lists of a
stripe disk in an earlier call to make_stripe_request(), then it's not
safe to just free the bio if a later part of it is found to cross the
reshape position. Doing so can lead to a UAF error, when bio_endio()
is called on the bio for the earlier stripes.

Instead, raid5_make_request() needs to wait until all parts of the bio
have called bio_endio(). To do this, bios that cross the reshape
position while the reshape can't make progress are flagged as needing a
retry, and mddev tracks the number of bios needing a retry which have
not yet completed. When raid5_make_request() has a bio that failed
make_stripe_request() with STRIPE_WAIT_RESHAPE, it waits for this
counter to reach zero. When the bio_endio() is called for the last time
on a bio needing a retry, it decrements mddev's count of outstanding
bios needing a retry. This guarantees that raid5_make_request() doesn't
return until the cloned bio needing a retry for io across the reshape
boundary is safely cleaned up.

There is a simple reproducer available at [1]. Compile the kernel with
KASAN for more useful reporting when the error is triggered (this is not
necessary to see the bug).

[1] https://gist.github.com/bmarzins/e48598824305cf2171289e47d7241fa5

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---

I've tested this for regressions with the lvm2-testsuite raid tests. I
have not run any md-specific tests on it.


 drivers/md/md.c    | 30 +++++++++---------------------
 drivers/md/md.h    |  5 ++++-
 drivers/md/raid5.c |  8 +++++++-
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3ce6f9e9d38e..5ec116b9da32 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -776,9 +776,11 @@ int mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->sync_seq, 0);
+	atomic_set(&mddev->pending_retry_bios, 0);
 	spin_lock_init(&mddev->lock);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
+	init_waitqueue_head(&mddev->retry_bios_wait);
 	mddev->reshape_position = MaxSector;
 	mddev->reshape_backwards = 0;
 	mddev->last_sync_action = ACTION_IDLE;
@@ -9218,6 +9220,7 @@ static void md_end_clone_io(struct bio *bio)
 	struct md_io_clone *md_io_clone = bio->bi_private;
 	struct bio *orig_bio = md_io_clone->orig_bio;
 	struct mddev *mddev = md_io_clone->mddev;
+	unsigned int must_retry = md_io_clone->must_retry;
 
 	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
 		md_bitmap_end(mddev, md_io_clone);
@@ -9229,7 +9232,11 @@ static void md_end_clone_io(struct bio *bio)
 		bio_end_io_acct(orig_bio, md_io_clone->start_time);
 
 	bio_put(bio);
-	bio_endio(orig_bio);
+	if (unlikely(must_retry)) {
+		if (atomic_dec_and_test(&mddev->pending_retry_bios))
+			wake_up(&mddev->retry_bios_wait);
+	} else
+		bio_endio(orig_bio);
 	percpu_ref_put(&mddev->active_io);
 }
 
@@ -9243,6 +9250,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
 	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
 	md_io_clone->orig_bio = *bio;
 	md_io_clone->mddev = mddev;
+	md_io_clone->must_retry = 0;
 	if (blk_queue_io_stat(bdev->bd_disk->queue))
 		md_io_clone->start_time = bio_start_io_acct(*bio);
 
@@ -9265,26 +9273,6 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
 }
 EXPORT_SYMBOL_GPL(md_account_bio);
 
-void md_free_cloned_bio(struct bio *bio)
-{
-	struct md_io_clone *md_io_clone = bio->bi_private;
-	struct bio *orig_bio = md_io_clone->orig_bio;
-	struct mddev *mddev = md_io_clone->mddev;
-
-	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
-		md_bitmap_end(mddev, md_io_clone);
-
-	if (bio->bi_status && !orig_bio->bi_status)
-		orig_bio->bi_status = bio->bi_status;
-
-	if (md_io_clone->start_time)
-		bio_end_io_acct(orig_bio, md_io_clone->start_time);
-
-	bio_put(bio);
-	percpu_ref_put(&mddev->active_io);
-}
-EXPORT_SYMBOL_GPL(md_free_cloned_bio);
-
 /* md_allow_write(mddev)
  * Calling this ensures that the array is marked 'active' so that writes
  * may proceed without blocking.  It is important to call this before
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ac84289664cd..49a231f11676 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -626,6 +626,9 @@ struct mddev {
 
 	/* The sequence number for sync thread */
 	atomic_t sync_seq;
+
+	wait_queue_head_t		retry_bios_wait;
+	atomic_t			pending_retry_bios;
 };
 
 enum recovery_flags {
@@ -877,6 +880,7 @@ struct md_io_clone {
 	sector_t	offset;
 	unsigned long	sectors;
 	enum stat_group	rw;
+	unsigned int	must_retry;
 	struct bio	bio_clone;
 };
 
@@ -917,7 +921,6 @@ extern void md_finish_reshape(struct mddev *mddev);
 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 			struct bio *bio, sector_t start, sector_t size);
 void md_account_bio(struct mddev *mddev, struct bio **bio);
-void md_free_cloned_bio(struct bio *bio);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
 void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a8e8d431071b..fb78a757f2fd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6217,7 +6217,13 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 
 	mempool_free(ctx, conf->ctx_pool);
 	if (res == STRIPE_WAIT_RESHAPE) {
-		md_free_cloned_bio(bi);
+		struct md_io_clone *md_io_clone = bi->bi_private;
+
+		md_io_clone->must_retry = 1;
+		atomic_inc(&mddev->pending_retry_bios);
+		bio_endio(bi);
+		wait_event(mddev->retry_bios_wait,
+			   atomic_read(&mddev->pending_retry_bios)==0);
 		return false;
 	}
 
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-04-07 12:23 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-23 22:58 [RFC PATCH] md/raid5: Fix UAF on IO across the reshape position Benjamin Marzinski
2026-03-30 15:44 ` Xiao Ni
2026-03-30 19:16   ` Benjamin Marzinski
2026-03-31  2:24     ` Xiao Ni
2026-03-31 16:36       ` Benjamin Marzinski
2026-04-01  0:22         ` Xiao Ni
2026-04-07  6:24           ` Yu Kuai
2026-04-07 12:23             ` Xiao Ni

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox