* [PATCH 1/7] md: factor error handling out of md_done_sync into helper
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
@ 2025-09-17 9:35 ` linan666
2025-09-22 1:24 ` Yu Kuai
2025-09-17 9:35 ` [PATCH 2/7] md: mark rdev Faulty when badblocks setting fails linan666
` (5 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
The 'ok' parameter in md_done_sync() is redundant for most callers that
always pass 'true'. Factor error handling logic into a separate helper
function md_sync_error() to eliminate unnecessary parameter passing and
improve code clarity.
No functional changes introduced.
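In sketch form, a failing caller changes from passing 'ok' to making the
error handling explicit (illustrative only; the exact call sites are in
the diff below):
	/* before */
	md_done_sync(mddev, sectors, 0);
	/* after */
	md_done_sync(mddev, sectors);	/* always account the synced blocks */
	md_sync_error(mddev);		/* only on the failure path */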
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.h | 3 ++-
drivers/md/md.c | 17 ++++++++++-------
drivers/md/raid1.c | 17 ++++++++---------
drivers/md/raid10.c | 11 ++++++-----
drivers/md/raid5.c | 14 ++++++++------
5 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1979c2d4fe89..ba567b63afd3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -904,7 +904,8 @@ extern const char *md_sync_action_name(enum sync_action action);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
-extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
+extern void md_done_sync(struct mddev *mddev, int blocks);
+extern void md_sync_error(struct mddev *mddev);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1de550108756..1795f725f7fb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8937,20 +8937,23 @@ static bool is_mddev_idle(struct mddev *mddev, int init)
return idle;
}
-void md_done_sync(struct mddev *mddev, int blocks, int ok)
+void md_done_sync(struct mddev *mddev, int blocks)
{
/* another "blocks" (512byte) blocks have been synced */
atomic_sub(blocks, &mddev->recovery_active);
wake_up(&mddev->recovery_wait);
- if (!ok) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
- // stop recovery, signal do_sync ....
- }
}
EXPORT_SYMBOL(md_done_sync);
+void md_sync_error(struct mddev *mddev)
+{
+ // stop recovery, signal do_sync ....
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+EXPORT_SYMBOL(md_sync_error);
+
/* md_write_start(mddev, bi)
* If we need to update some array metadata (e.g. 'active' flag
* in superblock) before writing, schedule a superblock update
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0e792b9bfff8..397b3a2eaee4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2074,7 +2074,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
} while (sectors_to_go > 0);
}
-static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+static void put_sync_write_buf(struct r1bio *r1_bio)
{
if (atomic_dec_and_test(&r1_bio->remaining)) {
struct mddev *mddev = r1_bio->mddev;
@@ -2085,20 +2085,19 @@ static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
reschedule_retry(r1_bio);
else {
put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
+ md_done_sync(mddev, s);
}
}
}
static void end_sync_write(struct bio *bio)
{
- int uptodate = !bio->bi_status;
struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
- if (!uptodate) {
+ if (bio->bi_status) {
abort_sync_write(mddev, r1_bio);
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -2111,7 +2110,7 @@ static void end_sync_write(struct bio *bio)
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
- put_sync_write_buf(r1_bio, uptodate);
+ put_sync_write_buf(r1_bio);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2361,8 +2360,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
!fix_sync_read_error(r1_bio)) {
conf->recovery_disabled = mddev->recovery_disabled;
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_done_sync(mddev, r1_bio->sectors, 0);
+ md_done_sync(mddev, r1_bio->sectors);
+ md_sync_error(mddev);
put_buf(r1_bio);
return;
}
@@ -2397,7 +2396,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
submit_bio_noacct(wbio);
}
- put_sync_write_buf(r1_bio, 1);
+ put_sync_write_buf(r1_bio);
}
/*
@@ -2588,7 +2587,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
}
}
put_buf(r1_bio);
- md_done_sync(conf->mddev, s, 1);
+ md_done_sync(conf->mddev, s);
}
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2411399a7352..2899fd1ecc57 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2282,7 +2282,7 @@ static void end_sync_request(struct r10bio *r10_bio)
reschedule_retry(r10_bio);
else
put_buf(r10_bio);
- md_done_sync(mddev, s, 1);
+ md_done_sync(mddev, s);
break;
} else {
struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
@@ -2458,7 +2458,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
done:
if (atomic_dec_and_test(&r10_bio->remaining)) {
- md_done_sync(mddev, r10_bio->sectors, 1);
+ md_done_sync(mddev, r10_bio->sectors);
put_buf(r10_bio);
}
}
@@ -3763,7 +3763,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* pretend they weren't skipped, it makes
* no important difference in this case
*/
- md_done_sync(mddev, sectors_skipped, 1);
+ md_done_sync(mddev, sectors_skipped);
return sectors_skipped + nr_sectors;
giveup:
@@ -4917,7 +4917,8 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
if (handle_reshape_read_error(mddev, r10_bio) < 0) {
/* Reshape has been aborted */
- md_done_sync(mddev, r10_bio->sectors, 0);
+ md_done_sync(mddev, r10_bio->sectors);
+ md_sync_error(mddev);
return;
}
@@ -5075,7 +5076,7 @@ static void end_reshape_request(struct r10bio *r10_bio)
{
if (!atomic_dec_and_test(&r10_bio->remaining))
return;
- md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
+ md_done_sync(r10_bio->mddev, r10_bio->sectors);
bio_put(r10_bio->master_bio);
put_buf(r10_bio);
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5112658ef5f6..b09265fb872a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3723,11 +3723,13 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
- if (abort)
- conf->recovery_disabled =
- conf->mddev->recovery_disabled;
}
- md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
+
+ if (abort) {
+ conf->recovery_disabled = conf->mddev->recovery_disabled;
+ md_sync_error(conf->mddev);
+ }
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -5156,7 +5158,7 @@ static void handle_stripe(struct stripe_head *sh)
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
@@ -5223,7 +5225,7 @@ static void handle_stripe(struct stripe_head *sh)
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_reshape);
- md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
}
if (s.expanding && s.locked == 0 &&
--
2.39.2
* Re: [PATCH 1/7] md: factor error handling out of md_done_sync into helper
2025-09-17 9:35 ` [PATCH 1/7] md: factor error handling out of md_done_sync into helper linan666
@ 2025-09-22 1:24 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2025-09-22 1:24 UTC (permalink / raw)
To: linan666, song, neil, namhyung
Cc: linux-raid, linux-kernel, yangerkun, yi.zhang, yukuai (C)
On 2025/09/17 17:35, linan666@huaweicloud.com wrote:
> From: Li Nan<linan122@huawei.com>
>
> The 'ok' parameter in md_done_sync() is redundant for most callers that
> always pass 'true'. Factor error handling logic into a separate helper
> function md_sync_error() to eliminate unnecessary parameter passing and
> improve code clarity.
>
> No functional changes introduced.
>
> Signed-off-by: Li Nan<linan122@huawei.com>
> ---
> drivers/md/md.h | 3 ++-
> drivers/md/md.c | 17 ++++++++++-------
> drivers/md/raid1.c | 17 ++++++++---------
> drivers/md/raid10.c | 11 ++++++-----
> drivers/md/raid5.c | 14 ++++++++------
> 5 files changed, 34 insertions(+), 28 deletions(-)
LGTM
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
* [PATCH 2/7] md: mark rdev Faulty when badblocks setting fails
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
2025-09-17 9:35 ` [PATCH 1/7] md: factor error handling out of md_done_sync into helper linan666
@ 2025-09-17 9:35 ` linan666
2025-09-22 2:01 ` Li Nan
2025-09-17 9:35 ` [PATCH 3/7] md: cleanup MD_RECOVERY_ERROR flag linan666
` (4 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
Currently, when a sync read fails and setting badblocks also fails
(exceeding the 512-entry limit), the rdev isn't immediately marked
Faulty. Instead, 'recovery_disabled' is set and non-In_sync rdevs are
removed later. This preserves array availability as long as the bad
regions aren't read, but users may read the bad sectors before the rdev
is removed. This happens because resync/recovery_offset is incorrectly
updated to include these bad sectors.
When badblocks exceed 512, keeping the disk provides little benefit
while adding complexity; prompt disk replacement is more important.
Therefore, when setting badblocks fails, call md_error() directly to
mark the rdev Faulty immediately, preventing reads of bad sectors.
After this change, cleanup of offset update logic and 'recovery_disabled'
handling will follow.
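As an illustration (not itself part of the diff), a typical caller is
reduced from:
	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
		md_error(rdev->mddev, rdev);
to:
	rdev_set_badblocks(rdev, sector, sectors, 0);
because rdev_set_badblocks() now calls md_error() itself on failure.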
Fixes: 5e5702898e93 ("md/raid10: Handle read errors during recovery better.")
Fixes: 3a9f28a5117e ("md/raid1: improve handling of read failure during recovery.")
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.c | 8 ++++++-
drivers/md/raid1.c | 41 +++++++++++++++--------------------
drivers/md/raid10.c | 53 ++++++++++++++++++++-------------------------
drivers/md/raid5.c | 22 ++++++++-----------
4 files changed, 57 insertions(+), 67 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1795f725f7fb..05b6b3145648 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -10245,8 +10245,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
else
s += rdev->data_offset;
- if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
+ if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
+ /*
+ * Mark the disk as Faulty when setting badblocks fails,
+ * otherwise, bad sectors may be read.
+ */
+ md_error(mddev, rdev);
return false;
+ }
/* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags))
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 397b3a2eaee4..f7238e9f35e5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2127,8 +2127,7 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
rdev->mddev->recovery);
}
/* need to record an error - either for the block or the device */
- if (!rdev_set_badblocks(rdev, sector, sectors, 0))
- md_error(rdev->mddev, rdev);
+ rdev_set_badblocks(rdev, sector, sectors, 0);
return 0;
}
@@ -2453,8 +2452,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
if (!success) {
/* Cannot read from anywhere - mark it bad */
struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
- if (!rdev_set_badblocks(rdev, sect, s, 0))
- md_error(mddev, rdev);
+ rdev_set_badblocks(rdev, sect, s, 0);
break;
}
/* write it back and re-read */
@@ -2498,7 +2496,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
}
-static bool narrow_write_error(struct r1bio *r1_bio, int i)
+static void narrow_write_error(struct r1bio *r1_bio, int i)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -2519,10 +2517,9 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r1_bio->sectors;
- bool ok = true;
if (rdev->badblocks.shift < 0)
- return false;
+ return;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2553,18 +2550,21 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
bio_trim(wbio, sector - r1_bio->sector, sectors);
wbio->bi_iter.bi_sector += rdev->data_offset;
- if (submit_bio_wait(wbio) < 0)
- /* failure! */
- ok = rdev_set_badblocks(rdev, sector,
- sectors, 0)
- && ok;
+ if (submit_bio_wait(wbio) < 0 &&
+ !rdev_set_badblocks(rdev, sector, sectors, 0)) {
+ /*
+ * Badblocks set failed, disk marked Faulty.
+ * No further operations needed.
+ */
+ bio_put(wbio);
+ break;
+ }
bio_put(wbio);
sect_to_write -= sectors;
sector += sectors;
sectors = block_sectors;
}
- return ok;
}
static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2577,14 +2577,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
if (bio->bi_end_io == NULL)
continue;
if (!bio->bi_status &&
- test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+ test_bit(R1BIO_MadeGood, &r1_bio->state))
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
- }
if (bio->bi_status &&
- test_bit(R1BIO_WriteError, &r1_bio->state)) {
- if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
- md_error(conf->mddev, rdev);
- }
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ rdev_set_badblocks(rdev, r1_bio->sector, s, 0);
}
put_buf(r1_bio);
md_done_sync(conf->mddev, s);
@@ -2608,10 +2605,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* errors.
*/
fail = true;
- if (!narrow_write_error(r1_bio, m))
- md_error(conf->mddev,
- conf->mirrors[m].rdev);
- /* an I/O failed, we can't clear the bitmap */
+ narrow_write_error(r1_bio, m);
+ /* an I/O failed, we can't clear the bitmap */
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2899fd1ecc57..4c58c32f7c27 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2610,8 +2610,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
&rdev->mddev->recovery);
}
/* need to record an error - either for the block or the device */
- if (!rdev_set_badblocks(rdev, sector, sectors, 0))
- md_error(rdev->mddev, rdev);
+ rdev_set_badblocks(rdev, sector, sectors, 0);
return 0;
}
@@ -2692,7 +2691,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
r10_bio->devs[slot].addr
+ sect,
s, 0)) {
- md_error(mddev, rdev);
r10_bio->devs[slot].bio
= IO_BLOCKED;
}
@@ -2779,7 +2777,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
}
-static bool narrow_write_error(struct r10bio *r10_bio, int i)
+static void narrow_write_error(struct r10bio *r10_bio, int i)
{
struct bio *bio = r10_bio->master_bio;
struct mddev *mddev = r10_bio->mddev;
@@ -2800,10 +2798,9 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r10_bio->sectors;
- bool ok = true;
if (rdev->badblocks.shift < 0)
- return false;
+ return;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2826,18 +2823,21 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
choose_data_offset(r10_bio, rdev);
wbio->bi_opf = REQ_OP_WRITE;
- if (submit_bio_wait(wbio) < 0)
- /* Failure! */
- ok = rdev_set_badblocks(rdev, wsector,
- sectors, 0)
- && ok;
+ if (submit_bio_wait(wbio) < 0 &&
+ !rdev_set_badblocks(rdev, wsector, sectors, 0)) {
+ /*
+ * Badblocks set failed, disk marked Faulty.
+ * No further operations needed.
+ */
+ bio_put(wbio);
+ break;
+ }
bio_put(wbio);
sect_to_write -= sectors;
sector += sectors;
sectors = block_sectors;
}
- return ok;
}
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
@@ -2897,35 +2897,29 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
if (r10_bio->devs[m].bio == NULL ||
r10_bio->devs[m].bio->bi_end_io == NULL)
continue;
- if (!r10_bio->devs[m].bio->bi_status) {
+ if (!r10_bio->devs[m].bio->bi_status)
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
- } else {
- if (!rdev_set_badblocks(
- rdev,
- r10_bio->devs[m].addr,
- r10_bio->sectors, 0))
- md_error(conf->mddev, rdev);
- }
+ else
+ rdev_set_badblocks(rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
rdev = conf->mirrors[dev].replacement;
if (r10_bio->devs[m].repl_bio == NULL ||
r10_bio->devs[m].repl_bio->bi_end_io == NULL)
continue;
- if (!r10_bio->devs[m].repl_bio->bi_status) {
+ if (!r10_bio->devs[m].repl_bio->bi_status)
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
- } else {
- if (!rdev_set_badblocks(
- rdev,
- r10_bio->devs[m].addr,
- r10_bio->sectors, 0))
- md_error(conf->mddev, rdev);
- }
+ else
+ rdev_set_badblocks(rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
}
put_buf(r10_bio);
} else {
@@ -2942,8 +2936,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_status) {
fail = true;
- if (!narrow_write_error(r10_bio, m))
- md_error(conf->mddev, rdev);
+ narrow_write_error(r10_bio, m);
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b09265fb872a..70106abf2110 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2817,11 +2817,9 @@ static void raid5_end_read_request(struct bio * bi)
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
- if (!(set_bad
- && test_bit(In_sync, &rdev->flags)
- && rdev_set_badblocks(
- rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
- md_error(conf->mddev, rdev);
+ if (!(set_bad && test_bit(In_sync, &rdev->flags)))
+ rdev_set_badblocks(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf), 0);
}
}
rdev_dec_pending(rdev, conf->mddev);
@@ -3599,11 +3597,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
else
rdev = NULL;
if (rdev) {
- if (!rdev_set_badblocks(
- rdev,
- sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0))
- md_error(conf->mddev, rdev);
+ rdev_set_badblocks(rdev,
+ sh->sector,
+ RAID5_STRIPE_SECTORS(conf),
+ 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
@@ -5254,9 +5251,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
- if (!rdev_set_badblocks(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0))
- md_error(conf->mddev, rdev);
+ rdev_set_badblocks(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
--
2.39.2
* Re: [PATCH 2/7] md: mark rdev Faulty when badblocks setting fails
2025-09-17 9:35 ` [PATCH 2/7] md: mark rdev Faulty when badblocks setting fails linan666
@ 2025-09-22 2:01 ` Li Nan
0 siblings, 0 replies; 13+ messages in thread
From: Li Nan @ 2025-09-22 2:01 UTC (permalink / raw)
To: linan666, song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, yangerkun, yi.zhang
On 2025/9/17 17:35, linan666@huaweicloud.com wrote:
> From: Li Nan <linan122@huawei.com>
>
> Currently, when a sync read fails and setting badblocks also fails
> (exceeding the 512-entry limit), the rdev isn't immediately marked
> Faulty. Instead, 'recovery_disabled' is set and non-In_sync rdevs are
> removed later. This preserves array availability as long as the bad
> regions aren't read, but users may read the bad sectors before the rdev
> is removed. This happens because resync/recovery_offset is incorrectly
> updated to include these bad sectors.
>
> When badblocks exceed 512, keeping the disk provides little benefit
> while adding complexity; prompt disk replacement is more important.
> Therefore, when setting badblocks fails, call md_error() directly to
> mark the rdev Faulty immediately, preventing reads of bad sectors.
>
> After this change, cleanup of offset update logic and 'recovery_disabled'
> handling will follow.
>
> Fixes: 5e5702898e93 ("md/raid10: Handle read errors during recovery better.")
> Fixes: 3a9f28a5117e ("md/raid1: improve handling of read failure during recovery.")
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
> drivers/md/md.c | 8 ++++++-
> drivers/md/raid1.c | 41 +++++++++++++++--------------------
> drivers/md/raid10.c | 53 ++++++++++++++++++++-------------------------
> drivers/md/raid5.c | 22 ++++++++-----------
> 4 files changed, 57 insertions(+), 67 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 1795f725f7fb..05b6b3145648 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -10245,8 +10245,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
> else
> s += rdev->data_offset;
>
> - if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
> + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
> + /*
> + * Mark the disk as Faulty when setting badblocks fails,
> + * otherwise, bad sectors may be read.
> + */
> + md_error(mddev, rdev);
> return false;
> + }
>
> /* Make sure they get written out promptly */
> if (test_bit(ExternalBbl, &rdev->flags))
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 397b3a2eaee4..f7238e9f35e5 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2127,8 +2127,7 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
> rdev->mddev->recovery);
> }
> /* need to record an error - either for the block or the device */
> - if (!rdev_set_badblocks(rdev, sector, sectors, 0))
> - md_error(rdev->mddev, rdev);
> + rdev_set_badblocks(rdev, sector, sectors, 0);
> return 0;
> }
>
> @@ -2453,8 +2452,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
> if (!success) {
> /* Cannot read from anywhere - mark it bad */
> struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
> - if (!rdev_set_badblocks(rdev, sect, s, 0))
> - md_error(mddev, rdev);
> + rdev_set_badblocks(rdev, sect, s, 0);
> break;
> }
> /* write it back and re-read */
> @@ -2498,7 +2496,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
> }
> }
>
> -static bool narrow_write_error(struct r1bio *r1_bio, int i)
> +static void narrow_write_error(struct r1bio *r1_bio, int i)
> {
> struct mddev *mddev = r1_bio->mddev;
> struct r1conf *conf = mddev->private;
> @@ -2519,10 +2517,9 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
> sector_t sector;
> int sectors;
> int sect_to_write = r1_bio->sectors;
> - bool ok = true;
>
> if (rdev->badblocks.shift < 0)
> - return false;
> + return;
>
> block_sectors = roundup(1 << rdev->badblocks.shift,
> bdev_logical_block_size(rdev->bdev) >> 9);
> @@ -2553,18 +2550,21 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
> bio_trim(wbio, sector - r1_bio->sector, sectors);
> wbio->bi_iter.bi_sector += rdev->data_offset;
>
> - if (submit_bio_wait(wbio) < 0)
> - /* failure! */
> - ok = rdev_set_badblocks(rdev, sector,
> - sectors, 0)
> - && ok;
> + if (submit_bio_wait(wbio) < 0 &&
> + !rdev_set_badblocks(rdev, sector, sectors, 0)) {
> + /*
> + * Badblocks set failed, disk marked Faulty.
> + * No further operations needed.
> + */
> + bio_put(wbio);
> + break;
> + }
>
> bio_put(wbio);
> sect_to_write -= sectors;
> sector += sectors;
> sectors = block_sectors;
> }
> - return ok;
> }
>
> static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
> @@ -2577,14 +2577,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
> if (bio->bi_end_io == NULL)
> continue;
> if (!bio->bi_status &&
> - test_bit(R1BIO_MadeGood, &r1_bio->state)) {
> + test_bit(R1BIO_MadeGood, &r1_bio->state))
> rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
> - }
> if (bio->bi_status &&
> - test_bit(R1BIO_WriteError, &r1_bio->state)) {
> - if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
> - md_error(conf->mddev, rdev);
> - }
> + test_bit(R1BIO_WriteError, &r1_bio->state))
> + rdev_set_badblocks(rdev, r1_bio->sector, s, 0);
> }
> put_buf(r1_bio);
> md_done_sync(conf->mddev, s);
> @@ -2608,10 +2605,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
> * errors.
> */
> fail = true;
> - if (!narrow_write_error(r1_bio, m))
> - md_error(conf->mddev,
> - conf->mirrors[m].rdev);
> - /* an I/O failed, we can't clear the bitmap */
> + narrow_write_error(r1_bio, m);
> + /* an I/O failed, we can't clear the bitmap */
> rdev_dec_pending(conf->mirrors[m].rdev,
> conf->mddev);
> }
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 2899fd1ecc57..4c58c32f7c27 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -2610,8 +2610,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
> &rdev->mddev->recovery);
> }
> /* need to record an error - either for the block or the device */
> - if (!rdev_set_badblocks(rdev, sector, sectors, 0))
> - md_error(rdev->mddev, rdev);
> + rdev_set_badblocks(rdev, sector, sectors, 0);
> return 0;
> }
>
> @@ -2692,7 +2691,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
> r10_bio->devs[slot].addr
> + sect,
> s, 0)) {
> - md_error(mddev, rdev);
> r10_bio->devs[slot].bio
> = IO_BLOCKED;
> }
> @@ -2779,7 +2777,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
> }
> }
>
> -static bool narrow_write_error(struct r10bio *r10_bio, int i)
> +static void narrow_write_error(struct r10bio *r10_bio, int i)
> {
> struct bio *bio = r10_bio->master_bio;
> struct mddev *mddev = r10_bio->mddev;
> @@ -2800,10 +2798,9 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
> sector_t sector;
> int sectors;
> int sect_to_write = r10_bio->sectors;
> - bool ok = true;
>
> if (rdev->badblocks.shift < 0)
> - return false;
> + return;
>
A direct return here is incorrect. This change ought to be removed after
Kenta's cleanup; I will fix it once that cleanup is merged.
https://lore.kernel.org/all/cbce67bc-74c6-0c99-fdba-48cd8aa27dda@huaweicloud.com/
> block_sectors = roundup(1 << rdev->badblocks.shift,
> bdev_logical_block_size(rdev->bdev) >> 9);
> @@ -2826,18 +2823,21 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
> choose_data_offset(r10_bio, rdev);
> wbio->bi_opf = REQ_OP_WRITE;
>
> - if (submit_bio_wait(wbio) < 0)
> - /* Failure! */
> - ok = rdev_set_badblocks(rdev, wsector,
> - sectors, 0)
> - && ok;
> + if (submit_bio_wait(wbio) < 0 &&
> + !rdev_set_badblocks(rdev, wsector, sectors, 0)) {
> + /*
> + * Badblocks set failed, disk marked Faulty.
> + * No further operations needed.
> + */
> + bio_put(wbio);
> + break;
> + }
>
> bio_put(wbio);
> sect_to_write -= sectors;
> sector += sectors;
> sectors = block_sectors;
> }
> - return ok;
> }
>
> static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
> @@ -2897,35 +2897,29 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
> if (r10_bio->devs[m].bio == NULL ||
> r10_bio->devs[m].bio->bi_end_io == NULL)
> continue;
> - if (!r10_bio->devs[m].bio->bi_status) {
> + if (!r10_bio->devs[m].bio->bi_status)
> rdev_clear_badblocks(
> rdev,
> r10_bio->devs[m].addr,
> r10_bio->sectors, 0);
> - } else {
> - if (!rdev_set_badblocks(
> - rdev,
> - r10_bio->devs[m].addr,
> - r10_bio->sectors, 0))
> - md_error(conf->mddev, rdev);
> - }
> + else
> + rdev_set_badblocks(rdev,
> + r10_bio->devs[m].addr,
> + r10_bio->sectors, 0);
> rdev = conf->mirrors[dev].replacement;
> if (r10_bio->devs[m].repl_bio == NULL ||
> r10_bio->devs[m].repl_bio->bi_end_io == NULL)
> continue;
>
> - if (!r10_bio->devs[m].repl_bio->bi_status) {
> + if (!r10_bio->devs[m].repl_bio->bi_status)
> rdev_clear_badblocks(
> rdev,
> r10_bio->devs[m].addr,
> r10_bio->sectors, 0);
> - } else {
> - if (!rdev_set_badblocks(
> - rdev,
> - r10_bio->devs[m].addr,
> - r10_bio->sectors, 0))
> - md_error(conf->mddev, rdev);
> - }
> + else
> + rdev_set_badblocks(rdev,
> + r10_bio->devs[m].addr,
> + r10_bio->sectors, 0);
> }
> put_buf(r10_bio);
> } else {
> @@ -2942,8 +2936,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
> rdev_dec_pending(rdev, conf->mddev);
> } else if (bio != NULL && bio->bi_status) {
> fail = true;
> - if (!narrow_write_error(r10_bio, m))
> - md_error(conf->mddev, rdev);
> + narrow_write_error(r10_bio, m);
> rdev_dec_pending(rdev, conf->mddev);
> }
> bio = r10_bio->devs[m].repl_bio;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index b09265fb872a..70106abf2110 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -2817,11 +2817,9 @@ static void raid5_end_read_request(struct bio * bi)
> else {
> clear_bit(R5_ReadError, &sh->dev[i].flags);
> clear_bit(R5_ReWrite, &sh->dev[i].flags);
> - if (!(set_bad
> - && test_bit(In_sync, &rdev->flags)
> - && rdev_set_badblocks(
> - rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
> - md_error(conf->mddev, rdev);
> + if (!(set_bad && test_bit(In_sync, &rdev->flags)))
> + rdev_set_badblocks(rdev, sh->sector,
> + RAID5_STRIPE_SECTORS(conf), 0);
> }
> }
> rdev_dec_pending(rdev, conf->mddev);
> @@ -3599,11 +3597,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> else
> rdev = NULL;
> if (rdev) {
> - if (!rdev_set_badblocks(
> - rdev,
> - sh->sector,
> - RAID5_STRIPE_SECTORS(conf), 0))
> - md_error(conf->mddev, rdev);
> + rdev_set_badblocks(rdev,
> + sh->sector,
> + RAID5_STRIPE_SECTORS(conf),
> + 0);
> rdev_dec_pending(rdev, conf->mddev);
> }
> }
> @@ -5254,9 +5251,8 @@ static void handle_stripe(struct stripe_head *sh)
> if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
> /* We own a safe reference to the rdev */
> rdev = conf->disks[i].rdev;
> - if (!rdev_set_badblocks(rdev, sh->sector,
> - RAID5_STRIPE_SECTORS(conf), 0))
> - md_error(conf->mddev, rdev);
> + rdev_set_badblocks(rdev, sh->sector,
> + RAID5_STRIPE_SECTORS(conf), 0);
> rdev_dec_pending(rdev, conf->mddev);
> }
> if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
--
Thanks,
Nan
* [PATCH 3/7] md: cleanup MD_RECOVERY_ERROR flag
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
2025-09-17 9:35 ` [PATCH 1/7] md: factor error handling out of md_done_sync into helper linan666
2025-09-17 9:35 ` [PATCH 2/7] md: mark rdev Faulty when badblocks setting fails linan666
@ 2025-09-17 9:35 ` linan666
2025-09-25 8:50 ` Yu Kuai
2025-09-17 9:35 ` [PATCH 4/7] md: factor out sync completion update into helper linan666
` (3 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
The MD_RECOVERY_ERROR flag prevented bad sectors from advancing
'resync_offset' when setting badblocks failed during sync errors.
Now that a failure to set badblocks definitively marks the disk as
Faulty, this flag is redundant. In both scenarios (successful
badblock marking or disk fault), the bad sectors become unreadable
and their handling is considered complete.
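The checkpoint logic in md_do_sync() then collapses to roughly the
following (a sketch of the resulting code; see the diff below):
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		mddev->curr_resync = MaxSector;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		mddev->resync_offset = mddev->curr_resync;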
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.h | 2 --
drivers/md/md.c | 22 ++++------------------
2 files changed, 4 insertions(+), 20 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ba567b63afd3..07a22f3772d8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -644,8 +644,6 @@ enum recovery_flags {
MD_RECOVERY_FROZEN,
/* waiting for pers->start() to finish */
MD_RECOVERY_WAIT,
- /* interrupted because io-error */
- MD_RECOVERY_ERROR,
/* flags determines sync action, see details in enum sync_action */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 05b6b3145648..c4d765d57af7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8949,7 +8949,6 @@ void md_sync_error(struct mddev *mddev)
{
// stop recovery, signal do_sync ....
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL(md_sync_error);
@@ -9598,7 +9597,6 @@ void md_do_sync(struct md_thread *thread)
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9607,24 +9605,12 @@ void md_do_sync(struct md_thread *thread)
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > MD_RESYNC_ACTIVE) {
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ mddev->curr_resync = MaxSector;
+
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- if (mddev->curr_resync >= mddev->resync_offset) {
- pr_debug("md: checkpointing %s of %s.\n",
- desc, mdname(mddev));
- if (test_bit(MD_RECOVERY_ERROR,
- &mddev->recovery))
- mddev->resync_offset =
- mddev->curr_resync_completed;
- else
- mddev->resync_offset =
- mddev->curr_resync;
- }
- } else
- mddev->resync_offset = MaxSector;
+ mddev->resync_offset = mddev->curr_resync;
} else {
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
- mddev->curr_resync = MaxSector;
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
--
2.39.2
* Re: [PATCH 3/7] md: cleanup MD_RECOVERY_ERROR flag
2025-09-17 9:35 ` [PATCH 3/7] md: cleanup MD_RECOVERY_ERROR flag linan666
@ 2025-09-25 8:50 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2025-09-25 8:50 UTC (permalink / raw)
To: linan666, song, neil, namhyung
Cc: linux-raid, linux-kernel, yangerkun, yi.zhang, yukuai (C)
Hi,
On 2025/09/17 17:35, linan666@huaweicloud.com wrote:
> From: Li Nan <linan122@huawei.com>
>
> The MD_RECOVERY_ERROR flag prevented bad sectors from advancing
> 'resync_offset' when setting badblocks failed during sync errors.
> Now that a failure to set badblocks definitively marks the disk as
> Faulty, this flag is redundant. In both scenarios (successful
> badblock marking or disk fault), the bad sectors become unreadable
> and their handling is considered complete.
>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
> drivers/md/md.h | 2 --
> drivers/md/md.c | 22 ++++------------------
> 2 files changed, 4 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index ba567b63afd3..07a22f3772d8 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -644,8 +644,6 @@ enum recovery_flags {
> MD_RECOVERY_FROZEN,
> /* waiting for pers->start() to finish */
> MD_RECOVERY_WAIT,
> - /* interrupted because io-error */
> - MD_RECOVERY_ERROR,
>
> /* flags determines sync action, see details in enum sync_action */
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 05b6b3145648..c4d765d57af7 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8949,7 +8949,6 @@ void md_sync_error(struct mddev *mddev)
> {
> // stop recovery, signal do_sync ....
> set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> - set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
> md_wakeup_thread(mddev->thread);
> }
> EXPORT_SYMBOL(md_sync_error);
> @@ -9598,7 +9597,6 @@ void md_do_sync(struct md_thread *thread)
> wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
>
> if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
Not sure why this is removed, otherwise LGTM.
Thanks,
Kuai
> mddev->curr_resync >= MD_RESYNC_ACTIVE) {
> mddev->curr_resync_completed = mddev->curr_resync;
> sysfs_notify_dirent_safe(mddev->sysfs_completed);
> @@ -9607,24 +9605,12 @@ void md_do_sync(struct md_thread *thread)
>
> if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
> mddev->curr_resync > MD_RESYNC_ACTIVE) {
> + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> + mddev->curr_resync = MaxSector;
> +
> if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
> - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
> - if (mddev->curr_resync >= mddev->resync_offset) {
> - pr_debug("md: checkpointing %s of %s.\n",
> - desc, mdname(mddev));
> - if (test_bit(MD_RECOVERY_ERROR,
> - &mddev->recovery))
> - mddev->resync_offset =
> - mddev->curr_resync_completed;
> - else
> - mddev->resync_offset =
> - mddev->curr_resync;
> - }
> - } else
> - mddev->resync_offset = MaxSector;
> + mddev->resync_offset = mddev->curr_resync;
> } else {
> - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> - mddev->curr_resync = MaxSector;
> if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
> rcu_read_lock();
>
* [PATCH 4/7] md: factor out sync completion update into helper
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
` (2 preceding siblings ...)
2025-09-17 9:35 ` [PATCH 3/7] md: cleanup MD_RECOVERY_ERROR flag linan666
@ 2025-09-17 9:35 ` linan666
2025-09-25 9:00 ` Yu Kuai
2025-09-17 9:35 ` [PATCH 5/7] md/raid10: fix any_working flag handling in raid10_sync_request linan666
` (2 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
Repeatedly reading the 'mddev->recovery' flags in md_do_sync() is
risky: if the flags are modified during sync, offsets may be updated
incorrectly. Therefore, replace direct 'mddev->recovery' checks with the
cached 'action'.
Move the sync completion update logic into a new helper,
md_finish_sync(), which improves readability and maintainability.
The reshape completion update remains safe, as it is only performed
after a successful reshape, when MD_RECOVERY_INTR is not set and
'curr_resync' equals 'max_sectors'.
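In sketch form, md_do_sync() samples the action once and passes it down,
instead of re-testing 'mddev->recovery' bits at completion time
(simplified; see the diff below):
	enum sync_action action = md_sync_action(mddev);	/* read once */
	...
	if (mddev->curr_resync > MD_RESYNC_ACTIVE)
		md_finish_sync(mddev, action);	/* decide on the cached action */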
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.c | 82 ++++++++++++++++++++++++++++---------------------
1 file changed, 47 insertions(+), 35 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c4d765d57af7..f4f80d32db98 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9301,6 +9301,51 @@ static bool sync_io_within_limit(struct mddev *mddev)
(raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
}
+/*
+ * Update sync offset and mddev status when sync completes
+ */
+static void md_finish_sync(struct mddev *mddev, enum sync_action action)
+{
+ struct md_rdev *rdev;
+
+ switch (action) {
+ case ACTION_RESYNC:
+ case ACTION_REPAIR:
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ mddev->curr_resync = MaxSector;
+ mddev->resync_offset = mddev->curr_resync;
+ break;
+ case ACTION_RECOVER:
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ mddev->curr_resync = MaxSector;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (mddev->delta_disks >= 0 &&
+ rdev_needs_recovery(rdev, mddev->curr_resync))
+ rdev->recovery_offset = mddev->curr_resync;
+ rcu_read_unlock();
+ break;
+ case ACTION_RESHAPE:
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->delta_disks > 0 &&
+ mddev->pers->finish_reshape &&
+ mddev->pers->size &&
+ !mddev_is_dm(mddev)) {
+ mddev_lock_nointr(mddev);
+ md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
+ mddev_unlock(mddev);
+ if (!mddev_is_clustered(mddev))
+ set_capacity_and_notify(mddev->gendisk,
+ mddev->array_sectors);
+ }
+ break;
+ /* nothing to update */
+ case ACTION_CHECK:
+ default:
+ break;
+ }
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9316,7 +9361,6 @@ void md_do_sync(struct md_thread *thread)
int last_mark,m;
sector_t last_check;
int skipped = 0;
- struct md_rdev *rdev;
enum sync_action action;
const char *desc;
struct blk_plug plug;
@@ -9603,46 +9647,14 @@ void md_do_sync(struct md_thread *thread)
}
mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
- if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
- mddev->curr_resync > MD_RESYNC_ACTIVE) {
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
- mddev->curr_resync = MaxSector;
-
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- mddev->resync_offset = mddev->curr_resync;
- } else {
- if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (mddev->delta_disks >= 0 &&
- rdev_needs_recovery(rdev, mddev->curr_resync))
- rdev->recovery_offset = mddev->curr_resync;
- rcu_read_unlock();
- }
- }
- }
+ if (mddev->curr_resync > MD_RESYNC_ACTIVE)
+ md_finish_sync(mddev, action);
skip:
/* set CHANGE_PENDING here since maybe another update is needed,
* so other nodes are informed. It should be harmless for normal
* raid */
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
-
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- mddev->delta_disks > 0 &&
- mddev->pers->finish_reshape &&
- mddev->pers->size &&
- !mddev_is_dm(mddev)) {
- mddev_lock_nointr(mddev);
- md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
- mddev_unlock(mddev);
- if (!mddev_is_clustered(mddev))
- set_capacity_and_notify(mddev->gendisk,
- mddev->array_sectors);
- }
-
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
--
2.39.2
* Re: [PATCH 4/7] md: factor out sync completion update into helper
2025-09-17 9:35 ` [PATCH 4/7] md: factor out sync completion update into helper linan666
@ 2025-09-25 9:00 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2025-09-25 9:00 UTC (permalink / raw)
To: linan666, song, neil, namhyung
Cc: linux-raid, linux-kernel, yangerkun, yi.zhang, yukuai (C)
Hi,
On 2025/09/17 17:35, linan666@huaweicloud.com wrote:
> From: Li Nan <linan122@huawei.com>
>
> Repeatedly reading the 'mddev->recovery' flags in md_do_sync() is
> risky: if the flags are modified during sync, offsets may be updated
> incorrectly. Therefore, replace direct 'mddev->recovery' checks with
> the cached 'action'.
>
> Move the sync completion update logic into a new helper,
> md_finish_sync(), which improves readability and maintainability.
>
> The reshape completion update remains safe, as it is only performed
> after a successful reshape, when MD_RECOVERY_INTR is not set and
> 'curr_resync' equals 'max_sectors'.
>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
> drivers/md/md.c | 82 ++++++++++++++++++++++++++++---------------------
> 1 file changed, 47 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index c4d765d57af7..f4f80d32db98 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -9301,6 +9301,51 @@ static bool sync_io_within_limit(struct mddev *mddev)
> (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
> }
>
> +/*
> + * Update sync offset and mddev status when sync completes
> + */
> +static void md_finish_sync(struct mddev *mddev, enum sync_action action)
> +{
> + struct md_rdev *rdev;
> +
> + switch (action) {
> + case ACTION_RESYNC:
> + case ACTION_REPAIR:
> + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> + mddev->curr_resync = MaxSector;
> + mddev->resync_offset = mddev->curr_resync;
> + break;
> + case ACTION_RECOVER:
> + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> + mddev->curr_resync = MaxSector;
> + rcu_read_lock();
> + rdev_for_each_rcu(rdev, mddev)
> + if (mddev->delta_disks >= 0 &&
> + rdev_needs_recovery(rdev, mddev->curr_resync))
> + rdev->recovery_offset = mddev->curr_resync;
> + rcu_read_unlock();
> + break;
> + case ACTION_RESHAPE:
> + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
> + mddev->delta_disks > 0 &&
> + mddev->pers->finish_reshape &&
> + mddev->pers->size &&
> + !mddev_is_dm(mddev)) {
> + mddev_lock_nointr(mddev);
> + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
> + mddev_unlock(mddev);
> + if (!mddev_is_clustered(mddev))
> + set_capacity_and_notify(mddev->gendisk,
> + mddev->array_sectors);
> + }
> + break;
I think pers->finish_reshape() can be moved from md_reap_sync_thread() to
here as well. Otherwise LGTM.
Thanks,
Kuai
> + /* nothing to update */
> + case ACTION_CHECK:
> + default:
> + break;
> + }
> +}
> +
> #define SYNC_MARKS 10
> #define SYNC_MARK_STEP (3*HZ)
> #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9316,7 +9361,6 @@ void md_do_sync(struct md_thread *thread)
> int last_mark,m;
> sector_t last_check;
> int skipped = 0;
> - struct md_rdev *rdev;
> enum sync_action action;
> const char *desc;
> struct blk_plug plug;
> @@ -9603,46 +9647,14 @@ void md_do_sync(struct md_thread *thread)
> }
> mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
>
> - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
> - mddev->curr_resync > MD_RESYNC_ACTIVE) {
> - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
> - mddev->curr_resync = MaxSector;
> -
> - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
> - mddev->resync_offset = mddev->curr_resync;
> - } else {
> - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> - test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
> - rcu_read_lock();
> - rdev_for_each_rcu(rdev, mddev)
> - if (mddev->delta_disks >= 0 &&
> - rdev_needs_recovery(rdev, mddev->curr_resync))
> - rdev->recovery_offset = mddev->curr_resync;
> - rcu_read_unlock();
> - }
> - }
> - }
> + if (mddev->curr_resync > MD_RESYNC_ACTIVE)
> + md_finish_sync(mddev, action);
> skip:
> /* set CHANGE_PENDING here since maybe another update is needed,
> * so other nodes are informed. It should be harmless for normal
> * raid */
> set_mask_bits(&mddev->sb_flags, 0,
> BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
> -
> - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
> - mddev->delta_disks > 0 &&
> - mddev->pers->finish_reshape &&
> - mddev->pers->size &&
> - !mddev_is_dm(mddev)) {
> - mddev_lock_nointr(mddev);
> - md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
> - mddev_unlock(mddev);
> - if (!mddev_is_clustered(mddev))
> - set_capacity_and_notify(mddev->gendisk,
> - mddev->array_sectors);
> - }
> -
> spin_lock(&mddev->lock);
> if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
> /* We completed so min/max setting can be forgotten if used. */
>
* [PATCH 5/7] md/raid10: fix any_working flag handling in raid10_sync_request
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
` (3 preceding siblings ...)
2025-09-17 9:35 ` [PATCH 4/7] md: factor out sync completion update into helper linan666
@ 2025-09-17 9:35 ` linan666
2025-09-25 9:01 ` Yu Kuai
2025-09-17 9:35 ` [PATCH 6/7] md/raid10: cleanup skip " linan666
2025-09-17 9:35 ` [PATCH 7/7] md: remove recovery_disabled linan666
6 siblings, 1 reply; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
In raid10_sync_request(), 'any_working' indicates if any IO will
be submitted. When there's only one In_sync disk with badblocks,
'any_working' might be set to 1 but no IO is submitted. Fix it by
setting 'any_working' after badblock checks.
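A sketch of the resulting flow (simplified from the diff below):
	sector = r10_bio->devs[j].addr;
	if (is_badblock(rdev, sector, max_sync, &first_bad, &bad_sectors)) {
		...
		continue;	/* no read will be issued from this disk */
	}
	any_working = 1;	/* set only once a read is really issued */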
Fixes: e875ecea266a ("md/raid10 record bad blocks as needed during recovery.")
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/raid10.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4c58c32f7c27..377895087602 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3401,7 +3401,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
!test_bit(In_sync, &rdev->flags))
continue;
/* This is where we read from */
- any_working = 1;
sector = r10_bio->devs[j].addr;
if (is_badblock(rdev, sector, max_sync,
@@ -3416,6 +3415,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
continue;
}
}
+ any_working = 1;
bio = r10_bio->devs[0].bio;
bio->bi_next = biolist;
biolist = bio;
--
2.39.2
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH 5/7] md/raid10: fix any_working flag handling in raid10_sync_request
2025-09-17 9:35 ` [PATCH 5/7] md/raid10: fix any_working flag handling in raid10_sync_request linan666
@ 2025-09-25 9:01 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2025-09-25 9:01 UTC (permalink / raw)
To: linan666, song, neil, namhyung
Cc: linux-raid, linux-kernel, yangerkun, yi.zhang, yukuai (C)
On 2025/09/17 17:35, linan666@huaweicloud.com wrote:
> From: Li Nan<linan122@huawei.com>
>
> In raid10_sync_request(), 'any_working' indicates if any IO will
> be submitted. When there's only one In_sync disk with badblocks,
> 'any_working' might be set to 1 but no IO is submitted. Fix it by
> setting 'any_working' after badblock checks.
>
> Fixes: e875ecea266a ("md/raid10 record bad blocks as needed during recovery.")
> Signed-off-by: Li Nan<linan122@huawei.com>
> ---
> drivers/md/raid10.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
LGTM
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
* [PATCH 6/7] md/raid10: cleanup skip handling in raid10_sync_request
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
` (4 preceding siblings ...)
2025-09-17 9:35 ` [PATCH 5/7] md/raid10: fix any_working flag handling in raid10_sync_request linan666
@ 2025-09-17 9:35 ` linan666
2025-09-17 9:35 ` [PATCH 7/7] md: remove recovery_disabled linan666
6 siblings, 0 replies; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
raid10_sync_request() skips a sector when it needs no syncing or when no
readable device exists. The current skip handling is needlessly complex:
- It jumps back to the 'skipped' label to reissue the next sector
instead of returning directly
- It completes the sync and returns 'max_sectors' when multiple sectors
are skipped due to badblocks
The first is error-prone. For example, commit bc49694a9e8f ("md: pass in
max_sectors for pers->sync_request()") removed redundant 'max_sector'
assignments. Since the skip path modifies 'max_sector', 'goto skipped'
leaves 'max_sector' equal to 'sector_nr' after the jump, which is
incorrect.
The second causes the sync to complete erroneously when no actual sync
occurs. For recovery, it is more suitable to record badblocks and keep
syncing the subsequent sectors. For resync, just skip the bad sectors
and sync the subsequent ones.
Clean up the complex and unnecessary skip code: return immediately when
a sector should be skipped. This reduces code paths and lowers
regression risk.
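With this change, every skip site follows one simple pattern (sketch):
	*skipped = 1;
	return max_sync;	/* report the skipped range; md_do_sync() moves on */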
Fixes: bc49694a9e8f ("md: pass in max_sectors for pers->sync_request()")
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/raid10.c | 96 +++++++++++----------------------------------
1 file changed, 22 insertions(+), 74 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 377895087602..0cd542b77842 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3167,11 +3167,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int i;
int max_sync;
sector_t sync_blocks;
- sector_t sectors_skipped = 0;
- int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
- int error_disk = -1;
/*
* Allow skipping a full rebuild for incremental assembly
@@ -3192,7 +3189,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (init_resync(conf))
return 0;
- skipped:
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
@@ -3244,33 +3240,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
- return sectors_skipped;
+ return 0;
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
- if (chunks_skipped >= conf->geo.raid_disks) {
- pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
- test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
- if (error_disk >= 0 &&
- !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- /*
- * recovery fails, set mirrors.recovery_disabled,
- * device shouldn't be added to there.
- */
- conf->mirrors[error_disk].recovery_disabled =
- mddev->recovery_disabled;
- return 0;
- }
- /*
- * if there has been nothing to do on any drive,
- * then there is nothing to do at all.
- */
- *skipped = 1;
- return (max_sector - sector_nr) + sectors_skipped;
- }
-
if (max_sector > mddev->resync_max)
max_sector = mddev->resync_max; /* Don't do IO beyond here */
@@ -3353,7 +3328,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here
*/
- chunks_skipped = -1;
continue;
}
if (mrdev)
@@ -3484,29 +3458,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
- if (mrdev && !test_bit(In_sync,
- &mrdev->flags)
- && !rdev_set_badblocks(
- mrdev,
- r10_bio->devs[k].addr,
- max_sync, 0))
- any_working = 0;
- if (mreplace &&
- !rdev_set_badblocks(
- mreplace,
- r10_bio->devs[k].addr,
- max_sync, 0))
- any_working = 0;
- }
- if (!any_working) {
- if (!test_and_set_bit(MD_RECOVERY_INTR,
- &mddev->recovery))
- pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
- mdname(mddev));
- mirror->recovery_disabled
- = mddev->recovery_disabled;
- } else {
- error_disk = i;
+ if (mrdev &&
+ !test_bit(In_sync, &mrdev->flags))
+ rdev_set_badblocks(
+ mrdev,
+ r10_bio->devs[k].addr,
+ max_sync, 0);
+ if (mreplace)
+ rdev_set_badblocks(
+ mreplace,
+ r10_bio->devs[k].addr,
+ max_sync, 0);
+ pr_warn("md/raid10:%s: cannot recovery sector %llu + %d.\n",
+ mdname(mddev), r10_bio->devs[k].addr, max_sync);
}
put_buf(r10_bio);
if (rb2)
@@ -3547,7 +3511,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rb2->master_bio = NULL;
put_buf(rb2);
}
- goto giveup;
+ *skipped = 1;
+ return max_sync;
}
} else {
/* resync. Schedule a read for every block at this virt offset */
@@ -3571,7 +3536,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
&mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
- return sync_blocks + sectors_skipped;
+ return sync_blocks;
}
if (sync_blocks < max_sync)
max_sync = sync_blocks;
@@ -3663,8 +3628,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mddev);
}
put_buf(r10_bio);
- biolist = NULL;
- goto giveup;
+ *skipped = 1;
+ return max_sync;
}
}
@@ -3684,7 +3649,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
- goto giveup;
+ *skipped = 1;
+ return max_sync;
}
}
nr_sectors += len>>9;
@@ -3752,25 +3718,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
- if (sectors_skipped)
- /* pretend they weren't skipped, it makes
- * no important difference in this case
- */
- md_done_sync(mddev, sectors_skipped);
-
- return sectors_skipped + nr_sectors;
- giveup:
- /* There is nowhere to write, so all non-sync
- * drives must be failed or in resync, all drives
- * have a bad block, so try the next chunk...
- */
- if (sector_nr + max_sync < max_sector)
- max_sector = sector_nr + max_sync;
-
- sectors_skipped += (max_sector - sector_nr);
- chunks_skipped ++;
- sector_nr = max_sector;
- goto skipped;
+ return nr_sectors;
}
static sector_t
--
2.39.2
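A note on the contract being leaned on above: raid10_sync_request() now
reports an unrecoverable chunk by setting *skipped and returning the range
to advance past, instead of looping internally via 'goto giveup'. A minimal
sketch of the caller side, assuming the usual md_do_sync() loop shape
(simplified for illustration; not the exact upstream code):

	/* 'j' is the current sync position in md_do_sync() */
	int skipped = 0;
	sector_t sectors = mddev->pers->sync_request(mddev, j, &skipped);

	if (!skipped)
		io_sectors += sectors;	/* only real I/O counts for speed */
	j += sectors;			/* advance past the range either way */

So the rewritten error paths can simply do '*skipped = 1; return max_sync;':
the caller steps over the bad chunk on its own, which is what the removed
chunks_skipped/sectors_skipped bookkeeping used to emulate inside the
function.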
* [PATCH 7/7] md: remove recovery_disabled
2025-09-17 9:35 [PATCH 0/7] cleanup and bugfix of sync linan666
` (5 preceding siblings ...)
2025-09-17 9:35 ` [PATCH 6/7] md/raid10: cleanup skip " linan666
@ 2025-09-17 9:35 ` linan666
6 siblings, 0 replies; 13+ messages in thread
From: linan666 @ 2025-09-17 9:35 UTC (permalink / raw)
To: song, yukuai3, neil, namhyung
Cc: linux-raid, linux-kernel, linan666, yangerkun, yi.zhang
From: Li Nan <linan122@huawei.com>
The 'recovery_disabled' logic is complex and confusing. It was originally
intended to keep a degraded array alive in extreme scenarios, and was used
in the following cases:
- When a sync fails and setting badblocks also fails, kick out the
non-In_sync rdev and block spare rdevs from joining, to preserve the
array [1]
- When the last backup is unavailable, prevent repeated add/remove of
spares from triggering recovery over and over [2]
The original issues are now resolved:
- Error handlers in all raid types prevent the last rdev from being
kicked out
- Disks whose recovery failed are marked Faulty and cannot rejoin the
array
Therefore, remove 'recovery_disabled' as it is no longer needed.
[1] 5389042ffa36 ("md: change managed of recovery_disabled.")
[2] 4044ba58dd15 ("md: don't retry recovery of raid1 that fails due to error on source drive.")
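To make the removed mechanism concrete: the gating followed the same
pattern in every personality. A simplified excerpt of what this patch
deletes (condensed from the raid1 hunks below, for illustration only):

	/* ->hot_add_disk() (e.g. raid1_add_disk) refused spares while the
	 * latched value matched the current generation counter: */
	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	/* and failed-recovery paths latched the generation to trip it: */
	conf->recovery_disabled = mddev->recovery_disabled;
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);

With both sides gone, spare addition is gated only by the rdev state
itself (Faulty, In_sync, degraded counts), which the earlier patches in
this series made reliable.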
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.h | 6 ------
drivers/md/raid1.h | 5 -----
drivers/md/raid10.h | 5 -----
drivers/md/raid5.h | 1 -
drivers/md/md.c | 3 ---
drivers/md/raid1.c | 17 +++--------------
drivers/md/raid10.c | 8 --------
drivers/md/raid5.c | 10 +---------
8 files changed, 4 insertions(+), 51 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 07a22f3772d8..5fc9acb57447 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -493,12 +493,6 @@ struct mddev {
int ok_start_degraded;
unsigned long recovery;
- /* If a RAID personality determines that recovery (of a particular
- * device) will fail due to a read error on the source device, it
- * takes a copy of this number and does not attempt recovery again
- * until this number changes.
- */
- int recovery_disabled;
int in_sync; /* know to not need resync */
/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index d236ef179cfb..dfd996f2886f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -93,11 +93,6 @@ struct r1conf {
*/
int fullsync;
- /* When the same as mddev->recovery_disabled we don't allow
- * recovery to be attempted as we expect a read error.
- */
- int recovery_disabled;
-
mempool_t *r1bio_pool;
mempool_t r1buf_pool;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3f16ad6904a9..78b7a11cddf7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -18,11 +18,6 @@
struct raid10_info {
struct md_rdev *rdev, *replacement;
sector_t head_position;
- int recovery_disabled; /* matches
- * mddev->recovery_disabled
- * when we shouldn't try
- * recovering this device.
- */
};
struct r10conf {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index eafc6e9ed6ee..eff2bba9d76f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -640,7 +640,6 @@ struct r5conf {
* (fresh device added).
* Cleared when a sync completes.
*/
- int recovery_disabled;
/* per cpu variables */
struct raid5_percpu __percpu *percpu;
int scribble_disks;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f4f80d32db98..75d30dc40848 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2577,9 +2577,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
- /* May as well allow recovery to be retried once */
- mddev->recovery_disabled++;
-
return 0;
fail:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7238e9f35e5..f1d4a495520c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1772,7 +1772,6 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(MD_BROKEN, &mddev->flags);
if (!mddev->fail_last_dev) {
- conf->recovery_disabled = mddev->recovery_disabled;
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
@@ -1916,7 +1915,6 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk)
/* Only remove non-faulty devices if recovery is not possible. */
if (!test_bit(Faulty, &rdev->flags) &&
- rdev->mddev->recovery_disabled != conf->recovery_disabled &&
rdev->mddev->degraded < conf->raid_disks)
return false;
@@ -1936,9 +1934,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
- if (mddev->recovery_disabled == conf->recovery_disabled)
- return -EBUSY;
-
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
@@ -2358,7 +2353,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
!fix_sync_read_error(r1_bio)) {
- conf->recovery_disabled = mddev->recovery_disabled;
md_done_sync(mddev, r1_bio->sectors);
md_sync_error(mddev);
put_buf(r1_bio);
@@ -2961,16 +2955,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
*skipped = 1;
put_buf(r1_bio);
- if (!ok) {
- /* Cannot record the badblocks, so need to
+ if (!ok)
+ /* Cannot record the badblocks; md_error() has set INTR to
* abort the resync.
- * If there are multiple read targets, could just
- * fail the really bad ones ???
*/
- conf->recovery_disabled = mddev->recovery_disabled;
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return 0;
- } else
+ else
return min_bad;
}
@@ -3157,7 +3147,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_barrier);
bio_list_init(&conf->pending_bio_list);
- conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
for (i = 0; i < conf->raid_disks * 2; i++) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0cd542b77842..2ef2f640a6f0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2136,8 +2136,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
mirror = first;
for ( ; mirror <= last ; mirror++) {
p = &conf->mirrors[mirror];
- if (p->recovery_disabled == mddev->recovery_disabled)
- continue;
if (p->rdev) {
if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement == NULL && repl_slot < 0)
@@ -2149,7 +2147,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err)
return err;
p->head_position = 0;
- p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror;
err = 0;
if (rdev->saved_raid_disk != mirror)
@@ -2202,7 +2199,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != p->recovery_disabled &&
(!p->replacement || p->replacement == rdev) &&
number < conf->geo.raid_disks &&
enough(conf, -1)) {
@@ -2541,8 +2537,6 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
pr_notice("md/raid10:%s: recovery aborted due to read error\n",
mdname(mddev));
- conf->mirrors[dw].recovery_disabled
- = mddev->recovery_disabled;
set_bit(MD_RECOVERY_INTR,
&mddev->recovery);
break;
@@ -4079,8 +4073,6 @@ static int raid10_run(struct mddev *mddev)
disk->replacement->saved_raid_disk < 0) {
conf->fullsync = 1;
}
-
- disk->recovery_disabled = mddev->recovery_disabled - 1;
}
if (mddev->resync_offset != MaxSector)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 70106abf2110..8b85e0e319a6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2918,7 +2918,6 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
if (has_failed(conf)) {
set_bit(MD_BROKEN, &conf->mddev->flags);
- conf->recovery_disabled = mddev->recovery_disabled;
pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -3723,10 +3722,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
}
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
- if (abort) {
- conf->recovery_disabled = conf->mddev->recovery_disabled;
+ if (abort)
md_sync_error(conf->mddev);
- }
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -7530,8 +7527,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
}
conf->bypass_threshold = BYPASS_THRESHOLD;
- conf->recovery_disabled = mddev->recovery_disabled - 1;
-
conf->raid_disks = mddev->raid_disks;
if (mddev->reshape_position == MaxSector)
conf->previous_raid_disks = mddev->raid_disks;
@@ -8203,7 +8198,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* isn't possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
(!p->replacement || p->replacement == rdev) &&
number < conf->raid_disks) {
@@ -8264,8 +8258,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return 0;
}
- if (mddev->recovery_disabled == conf->recovery_disabled)
- return -EBUSY;
if (rdev->saved_raid_disk < 0 && has_failed(conf))
/* no point adding a device */
--
2.39.2
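For completeness, a rough sketch of the failure flow that replaces
recovery_disabled after this series (placement simplified and partly
assumed; per patch 2, a failed badblocks update marks the rdev Faulty,
and md_sync_error() from patch 1 aborts the sync):

	/* unrecoverable sync error, post-series shape (approximate): */
	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
		md_error(mddev, rdev);	/* rdev goes Faulty, cannot rejoin */
	md_sync_error(mddev);		/* abort the resync (sets MD_RECOVERY_INTR) */

A Faulty rdev can no longer be re-added as a sync target, which is
exactly the behaviour recovery_disabled used to approximate with its
generation counter.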