* [PATCH v4 1/3] md: suspend array before raid10 reshape via sync_action
2026-06-03 3:59 [PATCH v4 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
@ 2026-06-03 3:59 ` Chen Cheng
2026-06-03 3:59 ` [PATCH v4 2/3] md/raid10: make r10bio_pool use fixed-size objects Chen Cheng
2026-06-03 3:59 ` [PATCH v4 3/3] md/raid10: bound reused r10bio devs[] walks by used_nr_devs Chen Cheng
2 siblings, 0 replies; 4+ messages in thread
From: Chen Cheng @ 2026-06-03 3:59 UTC (permalink / raw)
To: linux-raid, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
The sync_action=reshape path currently enters mddev_start_reshape() with
reconfig_mutex held but without suspending the array first. For raid10,
that means raid10_start_reshape() has to drop reconfig_mutex and then
suspend and relock the array through mddev_suspend_and_lock_nointr()
before it can safely switch geometry-dependent state.
Use mddev_suspend_and_lock() for ACTION_RESHAPE in action_store(), so
the sysfs reshape path reaches mddev_start_reshape() with the array
already suspended and locked.
Other sync_action operations keep using mddev_lock() unchanged.
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/md.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 096bb64e87bd..5bc937e149ac 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5256,30 +5256,39 @@ static int mddev_start_reshape(struct mddev *mddev)
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
int ret;
+ bool suspended = false;
enum sync_action action;
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
+ action = md_sync_action_by_name(page);
retry:
if (work_busy(&mddev->sync_work))
flush_work(&mddev->sync_work);
- ret = mddev_lock(mddev);
+ if (action == ACTION_RESHAPE) {
+ ret = mddev_suspend_and_lock(mddev);
+ suspended = true;
+ } else {
+ ret = mddev_lock(mddev);
+ suspended = false;
+ }
if (ret)
return ret;
if (work_busy(&mddev->sync_work)) {
- mddev_unlock(mddev);
+ if (suspended)
+ mddev_unlock_and_resume(mddev);
+ else
+ mddev_unlock(mddev);
goto retry;
}
- action = md_sync_action_by_name(page);
-
/* TODO: mdadm rely on "idle" to start sync_thread. */
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
switch (action) {
case ACTION_FROZEN:
md_frozen_sync_thread(mddev);
@@ -5344,11 +5353,14 @@ action_store(struct mddev *mddev, const char *page, size_t len)
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
ret = len;
out:
- mddev_unlock(mddev);
+ if (suspended)
+ mddev_unlock_and_resume(mddev);
+ else
+ mddev_unlock(mddev);
return ret;
}
static struct md_sysfs_entry md_scan_mode =
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
--
2.54.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v4 2/3] md/raid10: make r10bio_pool use fixed-size objects
2026-06-03 3:59 [PATCH v4 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
2026-06-03 3:59 ` [PATCH v4 1/3] md: suspend array before raid10 reshape via sync_action Chen Cheng
@ 2026-06-03 3:59 ` Chen Cheng
2026-06-03 3:59 ` [PATCH v4 3/3] md/raid10: bound reused r10bio devs[] walks by used_nr_devs Chen Cheng
2 siblings, 0 replies; 4+ messages in thread
From: Chen Cheng @ 2026-06-03 3:59 UTC (permalink / raw)
To: linux-raid, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
raid10 currently sizes regular r10bio_pool objects from
conf->geo.raid_disks, which makes the mempool element width depend on
the current geometry.
That breaks across reshape. Regular r10bio objects are preallocated and
reused, so after a geometry change the pool may still contain objects
allocated for the old width. A later request under the new geometry can
then reuse an r10bio whose devs[] array is still sized for the previous
raid_disks value.
Fix this by backing r10bio_pool with a fixed-size kmalloc mempool sized
for the maximum width needed across the current reshape transition.
Apply the same sizing rule to standalone r10bio objects allocated from
r10buf_pool_alloc().
This removes the geometry-dependent allocation width from regular
r10bio_pool objects and prevents reshape from reusing pool entries that
are too small for the new layout.
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/raid10.c | 48 +++++++++++++++++++++++++++++++++------------
drivers/md/raid10.h | 2 +-
2 files changed, 36 insertions(+), 14 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cee5a253a281..5eca34432e63 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -101,17 +101,32 @@ static void end_reshape(struct r10conf *conf);
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
return get_resync_pages(bio)->raid_bio;
}
-static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
+static inline unsigned int calc_r10bio_pool_disks(struct mddev *mddev)
{
- struct r10conf *conf = data;
- int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
+ /* If delta_disks < 0, use bigger r10bio->devs[] is ok. */
+ return mddev->raid_disks + max(0, mddev->delta_disks);
+}
+
+static inline int calc_r10bio_size(struct mddev *mddev)
+{
+ return offsetof(struct r10bio, devs[calc_r10bio_pool_disks(mddev)]);
+}
+
+static mempool_t *create_r10bio_pool(struct mddev *mddev)
+{
+ int size = calc_r10bio_size(mddev);
+
+ return mempool_create_kmalloc_pool(NR_RAID_BIOS, size);
+}
+
+static struct r10bio *alloc_r10bio(struct mddev *mddev, gfp_t gfp_flags)
+{
+ int size = calc_r10bio_size(mddev);
- /* allocate a r10bio with room for raid_disks entries in the
- * bios array */
return kzalloc(size, gfp_flags);
}
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
@@ -135,11 +150,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
struct bio *bio;
int j;
int nalloc, nalloc_rp;
struct resync_pages *rps;
- r10_bio = r10bio_pool_alloc(gfp_flags, conf);
+ r10_bio = alloc_r10bio(conf->mddev, gfp_flags);
if (!r10_bio)
return NULL;
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
@@ -275,11 +290,11 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
static void free_r10bio(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
put_all_bios(conf, r10_bio);
- mempool_free(r10_bio, &conf->r10bio_pool);
+ mempool_free(r10_bio, conf->r10bio_pool);
}
static void put_buf(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
@@ -1537,11 +1552,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
- r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
@@ -1729,11 +1744,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
last_stripe_index *= geo->far_copies;
end_disk_offset = (bio_end & geo->chunk_mask) +
(last_stripe_index << geo->chunk_shift);
retry_discard:
- r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->mddev = mddev;
r10_bio->state = 0;
r10_bio->sectors = 0;
r10_bio->read_slot = -1;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
@@ -3830,11 +3845,11 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
static void raid10_free_conf(struct r10conf *conf)
{
if (!conf)
return;
- mempool_exit(&conf->r10bio_pool);
+ mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
safe_put_page(conf->tmppage);
bioset_exit(&conf->bio_split);
@@ -3877,13 +3892,12 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->tmppage)
goto out;
conf->geo = geo;
conf->copies = copies;
- err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
- rbio_pool_free, conf);
- if (err)
+ conf->r10bio_pool = create_r10bio_pool(mddev);
+ if (!conf->r10bio_pool)
goto out;
err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
if (err)
goto out;
@@ -4373,10 +4387,11 @@ static int raid10_start_reshape(struct mddev *mddev)
struct geom new;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
int spares = 0;
int ret;
+ mempool_t *new_pool;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (setup_geo(&new, mddev, geo_start) != conf->copies)
@@ -4409,10 +4424,17 @@ static int raid10_start_reshape(struct mddev *mddev)
if (spares < mddev->delta_disks)
return -EINVAL;
conf->offset_diff = min_offset_diff;
+ if (mddev->delta_disks > 0) {
+ new_pool = create_r10bio_pool(mddev);
+ if (!new_pool)
+ return -ENOMEM;
+ mempool_destroy(conf->r10bio_pool);
+ conf->r10bio_pool = new_pool;
+ }
spin_lock_irq(&conf->device_lock);
if (conf->mirrors_new) {
memcpy(conf->mirrors_new, conf->mirrors,
sizeof(struct raid10_info)*conf->prev.raid_disks);
smp_mb();
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index ec79d87fb92f..b711626a5db7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -85,11 +85,11 @@ struct r10conf {
int have_replacement; /* There is at least one
* replacement device.
*/
wait_queue_head_t wait_barrier;
- mempool_t r10bio_pool;
+ mempool_t *r10bio_pool;
mempool_t r10buf_pool;
struct page *tmppage;
struct bio_set bio_split;
/* When taking over an array from a different personality, we store
--
2.54.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v4 3/3] md/raid10: bound reused r10bio devs[] walks by used_nr_devs
2026-06-03 3:59 [PATCH v4 0/3] md/raid10: fix r10bio width mismatches across reshape Chen Cheng
2026-06-03 3:59 ` [PATCH v4 1/3] md: suspend array before raid10 reshape via sync_action Chen Cheng
2026-06-03 3:59 ` [PATCH v4 2/3] md/raid10: make r10bio_pool use fixed-size objects Chen Cheng
@ 2026-06-03 3:59 ` Chen Cheng
2 siblings, 0 replies; 4+ messages in thread
From: Chen Cheng @ 2026-06-03 3:59 UTC (permalink / raw)
To: linux-raid, yukuai; +Cc: chencheng, linux-kernel
From: Chen Cheng <chencheng@fnnas.com>
After reshape changes raid_disks, an in-flight r10bio from the old geometry
can still be completed or freed later. In that case, using the current
geometry to walk r10_bio->devs[] is unsafe. A failure was reproduced with a
simple write workload while reshaping a raid10 array from 4 disks to 5 disks.
e.g.:
mdadm -C /dev/md777 -l10 -n4 /dev/sda /dev/sdb /dev/sdc /dev/sdd
mkfs.ext4 /dev/md777
mount /dev/md777 /mnt/test
fsstress -d /mnt/test -n 24000 -p 8 -l 24 &
mdadm /dev/md777 --add /dev/sde
mdadm --grow /dev/md777 --raid-devices=5 \
--backup-file=/tmp/md-reshape-backup
The sequence above can trigger:
BUG: KASAN: slab-out-of-bounds in free_r10bio+0x1c4/0x260 [raid10]
Read of size 8 at addr ffff00008c2dfac8 by task ksoftirqd/0/15
free_r10bio
raid_end_bio_io
one_write_done
raid10_end_write_request
The buggy object was 200 bytes long, which matches an r10bio with space for
only four devs[] entries. However, put_all_bios() and find_bio_disk() walk
r10_bio->devs[] using the current conf->geo.raid_disks value. Once reshape
switches conf->geo.raid_disks from 4 to 5, an old 4-slot r10bio can be
completed or freed as if it had 5 slots, and the walk overruns devs[4]. The
same stale-width mismatch can also surface during a 5-disk to 4-disk reshape.
Track the number of valid devs[] entries in each reused r10bio with
used_nr_devs. Initialize it whenever an r10bio is prepared for regular I/O,
discard, or resync/recovery/reshape work, and use it to bound devs[] walks
in put_all_bios() and find_bio_disk().
Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
drivers/md/raid10.c | 8 ++++++--
drivers/md/raid10.h | 2 ++
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5eca34432e63..f134b93fd593 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -273,11 +273,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
int i;
- for (i = 0; i < conf->geo.raid_disks; i++) {
+ for (i = 0; i < r10_bio->used_nr_devs; i++) {
struct bio **bio = & r10_bio->devs[i].bio;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
bio = &r10_bio->devs[i].repl_bio;
@@ -370,11 +370,11 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
struct bio *bio, int *slotp, int *replp)
{
int slot;
int repl = 0;
- for (slot = 0; slot < conf->geo.raid_disks; slot++) {
+ for (slot = 0; slot < r10_bio->used_nr_devs; slot++) {
if (r10_bio->devs[slot].bio == bio)
break;
if (r10_bio->devs[slot].repl_bio == bio) {
repl = 1;
break;
@@ -1561,10 +1561,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
+ r10_bio->used_nr_devs = conf->geo.raid_disks;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1749,10 +1750,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->mddev = mddev;
r10_bio->state = 0;
r10_bio->sectors = 0;
r10_bio->read_slot = -1;
+ r10_bio->used_nr_devs = geo->raid_disks;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
wait_blocked_dev(mddev, r10_bio);
/*
* For far layout it needs more than one r10bio to cover all regions.
@@ -3083,10 +3085,12 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */
else
nalloc = 2; /* recovery */
+ r10bio->used_nr_devs = nalloc;
+
for (i = 0; i < nalloc; i++) {
bio = r10bio->devs[i].bio;
rp = bio->bi_private;
bio_reset(bio, NULL, 0);
bio->bi_private = rp;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index b711626a5db7..4751119f9770 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -125,10 +125,12 @@ struct r10bio {
struct bio *master_bio;
/*
* if the IO is in READ direction, then this is where we read
*/
int read_slot;
+ /* Used to bound devs[] walks when the object is reused. */
+ unsigned int used_nr_devs;
struct list_head retry_list;
/*
* if the IO is in WRITE direction, then multiple bios are used,
* one for each copy.
--
2.54.0
^ permalink raw reply related [flat|nested] 4+ messages in thread