* [PATCH v6 1/3] md: suspend array when sync_action=reshape
2026-06-23 12:38 [PATCH v6 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
@ 2026-06-23 12:38 ` Chen Cheng
2026-06-23 12:55 ` sashiko-bot
2026-06-23 12:38 ` [PATCH v6 2/3] md/raid10: resize r10bio_pool for reshape Chen Cheng
2026-06-23 12:38 ` [PATCH v6 3/3] md/raid10: free r10bio before ending master_bio in raid_end_bio_io() and raid_end_discard_bio() Chen Cheng
2 siblings, 1 reply; 6+ messages in thread
From: Chen Cheng @ 2026-06-23 12:38 UTC (permalink / raw)
To: linux-raid, yukuai, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
raid10 needs to resize/swap r10bio_pool when reshape changes
raid_disks, and, don't let new requests keep allocating r10bio
objects from the old pool while that transition is in progress.
suspend and lock array before mddev_start_reshape(), and resume
it on exit.
Other sync_action ops are unchanged.
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/md.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 096bb64e87bd..1377c407614c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5261,25 +5261,29 @@ action_store(struct mddev *mddev, const char *page, size_t len)
enum sync_action action;
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
+ action = md_sync_action_by_name(page);
retry:
if (work_busy(&mddev->sync_work))
flush_work(&mddev->sync_work);
- ret = mddev_lock(mddev);
+ ret = (action == ACTION_RESHAPE) ?
+ mddev_suspend_and_lock(mddev) :
+ mddev_lock(mddev);
if (ret)
return ret;
if (work_busy(&mddev->sync_work)) {
- mddev_unlock(mddev);
+ if (action == ACTION_RESHAPE)
+ mddev_unlock_and_resume(mddev);
+ else
+ mddev_unlock(mddev);
goto retry;
}
- action = md_sync_action_by_name(page);
-
/* TODO: mdadm rely on "idle" to start sync_thread. */
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
switch (action) {
case ACTION_FROZEN:
md_frozen_sync_thread(mddev);
@@ -5344,11 +5348,14 @@ action_store(struct mddev *mddev, const char *page, size_t len)
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
ret = len;
out:
- mddev_unlock(mddev);
+ if (action == ACTION_RESHAPE)
+ mddev_unlock_and_resume(mddev);
+ else
+ mddev_unlock(mddev);
return ret;
}
static struct md_sysfs_entry md_scan_mode =
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
--
2.54.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH v6 2/3] md/raid10: resize r10bio_pool for reshape
2026-06-23 12:38 [PATCH v6 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
2026-06-23 12:38 ` [PATCH v6 1/3] md: suspend array when sync_action=reshape Chen Cheng
@ 2026-06-23 12:38 ` Chen Cheng
2026-06-23 13:00 ` sashiko-bot
2026-06-23 12:38 ` [PATCH v6 3/3] md/raid10: free r10bio before ending master_bio in raid_end_bio_io() and raid_end_discard_bio() Chen Cheng
2 siblings, 1 reply; 6+ messages in thread
From: Chen Cheng @ 2026-06-23 12:38 UTC (permalink / raw)
To: linux-raid, yukuai, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
When reshape grows raid_disks, the pool must also switch to new geometry
object size , and allocate a new geometry size pool and replace the old.
But not for shrinking reshape, because regular I/O can still use the
prev geo for sectors that have not crossed reshape_progress yet.
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/raid10.c | 44 +++++++++++++++++++++++++++++++++-----------
drivers/md/raid10.h | 2 +-
2 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cee5a253a281..d740744a9746 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -101,14 +101,25 @@ static void end_reshape(struct r10conf *conf);
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
return get_resync_pages(bio)->raid_bio;
}
-static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
+static inline int calc_r10bio_size(unsigned int raid_disks)
{
- struct r10conf *conf = data;
- int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
+ return offsetof(struct r10bio, devs[raid_disks]);
+}
+
+static mempool_t *create_r10bio_pool(unsigned int raid_disks)
+{
+ int size = calc_r10bio_size(raid_disks);
+
+ return mempool_create_kmalloc_pool(NR_RAID_BIOS, size);
+}
+
+static struct r10bio *alloc_r10bio(unsigned int raid_disks, gfp_t gfp_flags)
+{
+ int size = calc_r10bio_size(raid_disks);
/* allocate a r10bio with room for raid_disks entries in the
* bios array */
return kzalloc(size, gfp_flags);
}
@@ -135,11 +146,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
struct bio *bio;
int j;
int nalloc, nalloc_rp;
struct resync_pages *rps;
- r10_bio = r10bio_pool_alloc(gfp_flags, conf);
+ r10_bio = alloc_r10bio(conf->geo.raid_disks, gfp_flags);
if (!r10_bio)
return NULL;
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
@@ -275,11 +286,11 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
static void free_r10bio(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
put_all_bios(conf, r10_bio);
- mempool_free(r10_bio, &conf->r10bio_pool);
+ mempool_free(r10_bio, conf->r10bio_pool);
}
static void put_buf(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
@@ -1537,11 +1548,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
- r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
@@ -1729,11 +1740,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
last_stripe_index *= geo->far_copies;
end_disk_offset = (bio_end & geo->chunk_mask) +
(last_stripe_index << geo->chunk_shift);
retry_discard:
- r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->mddev = mddev;
r10_bio->state = 0;
r10_bio->sectors = 0;
r10_bio->read_slot = -1;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
@@ -3830,11 +3841,11 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
static void raid10_free_conf(struct r10conf *conf)
{
if (!conf)
return;
- mempool_exit(&conf->r10bio_pool);
+ mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
safe_put_page(conf->tmppage);
bioset_exit(&conf->bio_split);
@@ -3877,13 +3888,12 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->tmppage)
goto out;
conf->geo = geo;
conf->copies = copies;
- err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
- rbio_pool_free, conf);
- if (err)
+ conf->r10bio_pool = create_r10bio_pool(conf->geo.raid_disks);
+ if (!conf->r10bio_pool)
goto out;
err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
if (err)
goto out;
@@ -4373,10 +4383,11 @@ static int raid10_start_reshape(struct mddev *mddev)
struct geom new;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
int spares = 0;
int ret;
+ mempool_t *new_pool = NULL;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (setup_geo(&new, mddev, geo_start) != conf->copies)
@@ -4409,10 +4420,15 @@ static int raid10_start_reshape(struct mddev *mddev)
if (spares < mddev->delta_disks)
return -EINVAL;
conf->offset_diff = min_offset_diff;
+ if (mddev->delta_disks > 0) {
+ new_pool = create_r10bio_pool(new.raid_disks);
+ if (!new_pool)
+ return -ENOMEM;
+ }
spin_lock_irq(&conf->device_lock);
if (conf->mirrors_new) {
memcpy(conf->mirrors_new, conf->mirrors,
sizeof(struct raid10_info)*conf->prev.raid_disks);
smp_mb();
@@ -4509,10 +4525,14 @@ static int raid10_start_reshape(struct mddev *mddev)
mddev->degraded = calc_degraded(conf);
spin_unlock_irq(&conf->device_lock);
mddev->raid_disks = conf->geo.raid_disks;
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+ if (new_pool) {
+ mempool_destroy(conf->r10bio_pool);
+ conf->r10bio_pool = new_pool;
+ }
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -4531,10 +4551,12 @@ static int raid10_start_reshape(struct mddev *mddev)
smp_wmb();
conf->reshape_progress = MaxSector;
conf->reshape_safe = MaxSector;
mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
+ if (new_pool)
+ mempool_destroy(new_pool);
return ret;
}
/* Calculate the last device-address that could contain
* any block from the chunk that includes the array-address 's'
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index ec79d87fb92f..b711626a5db7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -85,11 +85,11 @@ struct r10conf {
int have_replacement; /* There is at least one
* replacement device.
*/
wait_queue_head_t wait_barrier;
- mempool_t r10bio_pool;
+ mempool_t *r10bio_pool;
mempool_t r10buf_pool;
struct page *tmppage;
struct bio_set bio_split;
/* When taking over an array from a different personality, we store
--
2.54.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH v6 3/3] md/raid10: free r10bio before ending master_bio in raid_end_bio_io() and raid_end_discard_bio()
2026-06-23 12:38 [PATCH v6 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
2026-06-23 12:38 ` [PATCH v6 1/3] md: suspend array when sync_action=reshape Chen Cheng
2026-06-23 12:38 ` [PATCH v6 2/3] md/raid10: resize r10bio_pool for reshape Chen Cheng
@ 2026-06-23 12:38 ` Chen Cheng
2 siblings, 0 replies; 6+ messages in thread
From: Chen Cheng @ 2026-06-23 12:38 UTC (permalink / raw)
To: linux-raid, yukuai, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
origin flow:
bio_endio(master_bio); /* may drop active_io to zero */
allow_barrier(conf);
free_r10bio(r10_bio); /* reads conf->geo, returns to pool */
one scenario is:
CPU A (softirq, raid_end_bio_io) CPU B (action_store) --> reshape
================================ ===============================
bio_endio(master_bio)
md_end_clone_io
percpu_ref_put -> 0
wait_event wakeup, and,
mddev_suspend return
raid10_start_reshape:
setup_geo(&conf->geo, new)
...
mempool_destroy(old_pool)
conf->r10bio_pool = new_pool
allow_barrier(conf)
free_r10bio(r10_bio)
put_all_bios:
for (i=0; i<conf->geo.raid_disks; i++)
==> old obj, new geo, OOB
mempool_free(r10_bio, conf->r10bio_pool)
==> old-geometry obj freed into new pool
so .. fix by reorder the flow:
free_r10bio(r10_bio)
allow_barrier(conf)
bio_endio(master_io)
raid_end_discard_bio() is exactly the same.
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/raid10.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d740744a9746..e44a9b6e95c7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -330,24 +330,25 @@ static void reschedule_retry(struct r10bio *r10_bio)
*/
static void raid_end_bio_io(struct r10bio *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
struct r10conf *conf = r10_bio->mddev->private;
+ unsigned long state = r10_bio->state;
- if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) {
- if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
- }
+ free_r10bio(r10_bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
*/
allow_barrier(conf);
- free_r10bio(r10_bio);
+ if (!test_and_set_bit(R10BIO_Returned, &state)) {
+ if (!test_bit(R10BIO_Uptodate, &state))
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
}
/*
* Update disk head position estimator based on IRQ completion info.
*/
@@ -1580,13 +1581,15 @@ static void raid_end_discard_bio(struct r10bio *r10bio)
if (!test_bit(R10BIO_Discard, &r10bio->state)) {
first_r10bio = (struct r10bio *)r10bio->master_bio;
free_r10bio(r10bio);
r10bio = first_r10bio;
} else {
+ struct bio *master_bio = r10bio->master_bio;
+
md_write_end(r10bio->mddev);
- bio_endio(r10bio->master_bio);
free_r10bio(r10bio);
+ bio_endio(master_bio);
break;
}
}
}
--
2.54.0
^ permalink raw reply related [flat|nested] 6+ messages in thread