* [PATCH 1/7] md/raid10: fix r10bio leak in raid10_write_request() error paths
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 2/7] md/raid1: handle atomic writes that require splitting Abd-Alrhman Masalkhi
` (5 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel, sashiko-bot
When raid10_write_request() fails because REQ_NOWAIT is set, the
allocated r10_bio is not freed before returning, resulting in a memory
leak. Free r10_bio before returning from the REQ_NOWAIT error paths.
Fixes: c9aa889b035f ("md: raid10 add nowait support")
Reported-by: sashiko-bot <sashiko-bot@kernel.org>
Closes: https://lore.kernel.org/linux-raid/20260613184042.BCEC01F000E9@smtp.kernel.org/
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0a3cfdd3f5df..bd322eccdc3f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1365,6 +1365,7 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
+ free_r10bio(r10_bio);
return false;
}
for (;;) {
@@ -1398,6 +1399,7 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
if (bio->bi_opf & REQ_NOWAIT) {
allow_barrier(conf);
bio_wouldblock_error(bio);
+ free_r10bio(r10_bio);
return false;
}
mddev_add_trace_msg(conf->mddev,
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 1/7] md/raid10: fix r10bio leak in raid10_write_request() error paths Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 8:11 ` John Garry
2026-06-23 7:24 ` [PATCH 3/7] md/raid10: " Abd-Alrhman Masalkhi
` (4 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
If a request already requires splitting when entering
raid1_write_request(), the current code allows it to proceed until it
eventually reaches the split path. Along the way, the bio may instead
fail due to other conditions and return a different status, even though
the request was invalid as an atomic write from the beginning.
Additionally, an otherwise valid atomic write may later require
splitting because bad blocks reduce the writable range or because
write-behind constraints reduce the maximum writable size. In these
cases, the bio currently completes with either EINVAL or ENOTSUPP,
whereas it should complete with EIO instead.
Fixes: f2a38abf5f1c ("md/raid1: Atomic write support")
Fixes: a4c55c902670 ("md/raid1: simplify raid1_write_request() error handling")
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid1.c | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 86d4f224ffb1..8386d37343a4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1511,9 +1511,15 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
bool write_behind = false;
bool nowait = bio->bi_opf & REQ_NOWAIT;
+ bool atomic = bio->bi_opf & REQ_ATOMIC;
bool is_discard = op_is_discard(bio->bi_opf);
sector_t sector = bio->bi_iter.bi_sector;
+ if (atomic && max_sectors != bio_sectors(bio)) {
+ bio_endio_status(bio, BLK_STS_INVAL);
+ return false;
+ }
+
if (mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE, sector,
bio_end_sector(bio))) {
@@ -1592,20 +1598,6 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
}
if (is_bad) {
int good_sectors;
-
- /*
- * We cannot atomically write this, so just
- * error in that case. It could be possible to
- * atomically write other mirrors, but the
- * complexity of supporting that is not worth
- * the benefit.
- */
- if (bio->bi_opf & REQ_ATOMIC) {
- bio->bi_status = BLK_STS_NOTSUPP;
- bio_endio(bio);
- goto err_dec_pending;
- }
-
good_sectors = first_bad - sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
@@ -1626,6 +1618,11 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
+ if (atomic) {
+ bio_io_error(bio);
+ goto err_dec_pending;
+ }
+
bio = bio_submit_split_bioset(bio, max_sectors,
&conf->bio_split);
if (!bio)
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 7:24 ` [PATCH 2/7] md/raid1: handle atomic writes that require splitting Abd-Alrhman Masalkhi
@ 2026-06-23 8:11 ` John Garry
2026-06-23 8:58 ` Abd-Alrhman Masalkhi
0 siblings, 1 reply; 13+ messages in thread
From: John Garry @ 2026-06-23 8:11 UTC (permalink / raw)
To: Abd-Alrhman Masalkhi, song, yukuai, magiclinan, xiao, axboe,
martin.petersen
Cc: linux-raid, linux-kernel
On 23/06/2026 08:24, Abd-Alrhman Masalkhi wrote:
> If a request already requires splitting when entering
> raid1_write_request(), the current code allows it to proceed until it
> eventually reaches the split path.
The block layer should catch invalid atomic writes in
submit_bio_noacct() -> blk_validate_atomic_write_op_size() before we
even get as far as the md atomic write handling. Having the check in
bio_submit_split_bioset() is really just a fail-safe for the block layer
not catching invalid atomic writes or the atomic writes queue limits not
being properly calculated.
> Along the way, the bio may instead
> fail due to other conditions and return a different status, even though
> the request was invalid as an atomic write from the beginning.
>
> Additionally, an otherwise valid atomic write may later require
> splitting because bad blocks reduce the writable range or because
> write-behind constraints reduce the maximum writable size. In these
> cases, the bio currently completes with either EINVAL or ENOTSUPP,
> whereas it should complete with EIO instead.
>
> Fixes: f2a38abf5f1c ("md/raid1: Atomic write support")
> Fixes: a4c55c902670 ("md/raid1: simplify raid1_write_request() error handling")
> Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
> ---
> drivers/md/raid1.c | 25 +++++++++++--------------
> 1 file changed, 11 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 86d4f224ffb1..8386d37343a4 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1511,9 +1511,15 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
> int first_clone;
> bool write_behind = false;
> bool nowait = bio->bi_opf & REQ_NOWAIT;
> + bool atomic = bio->bi_opf & REQ_ATOMIC;
> bool is_discard = op_is_discard(bio->bi_opf);
> sector_t sector = bio->bi_iter.bi_sector;
>
> + if (atomic && max_sectors != bio_sectors(bio)) {
> + bio_endio_status(bio, BLK_STS_INVAL);
> + return false;
> + }
> +
> if (mddev_is_clustered(mddev) &&
> mddev->cluster_ops->area_resyncing(mddev, WRITE, sector,
> bio_end_sector(bio))) {
> @@ -1592,20 +1598,6 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
> }
> if (is_bad) {
> int good_sectors;
> -
> - /*
> - * We cannot atomically write this, so just
> - * error in that case. It could be possible to
> - * atomically write other mirrors, but the
> - * complexity of supporting that is not worth
> - * the benefit.
> - */
> - if (bio->bi_opf & REQ_ATOMIC) {
> - bio->bi_status = BLK_STS_NOTSUPP;
what baseline are you using here? This looks different to linux-next 22
june and linus' master branch
> - bio_endio(bio);
> - goto err_dec_pending;
> - }
> -
> good_sectors = first_bad - sector;
> if (good_sectors < max_sectors)
> max_sectors = good_sectors;
> @@ -1626,6 +1618,11 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
> max_sectors = min_t(int, max_sectors,
> BIO_MAX_VECS * (PAGE_SIZE >> 9));
> if (max_sectors < bio_sectors(bio)) {
> + if (atomic) {
> + bio_io_error(bio);
> + goto err_dec_pending;
> + }
> +
> bio = bio_submit_split_bioset(bio, max_sectors,
> &conf->bio_split);
> if (!bio)
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 8:11 ` John Garry
@ 2026-06-23 8:58 ` Abd-Alrhman Masalkhi
2026-06-23 9:20 ` John Garry
0 siblings, 1 reply; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 8:58 UTC (permalink / raw)
To: John Garry, song, yukuai, magiclinan, xiao, axboe,
martin.petersen
Cc: linux-raid, linux-kernel
On Tue, Jun 23, 2026 at 09:11 +0100, John Garry wrote:
> On 23/06/2026 08:24, Abd-Alrhman Masalkhi wrote:
>> If a request already requires splitting when entering
>> raid1_write_request(), the current code allows it to proceed until it
>> eventually reaches the split path.
>
> The block layer should catch invalid atomic writes in
> submit_bio_noacct() -> blk_validate_atomic_write_op_size() before we
> even get as far as the md atomic write handling. Having the check in
> bio_submit_split_bioset() is really just a fail-safe for the block layer
> not catching invalid atomic writes or the atomic writes queue limits not
> being properly calculated.
The request size itself satisfies the currently advertised atomic write
limits, so blk_validate_atomic_write_op_size() allows it. The problem
is that RAID1 may further restrict atomic writes to a single barrier
unit via align_to_barrier_unit_end(). Therefore a request that crosses
a barrier-unit boundary can still reach raid1_write_request() with
max_sectors < bio_sectors(bio).
If the barrier-unit restriction should instead be advertised through the
atomic write queue limits, then I agree the block layer could reject
such requests earlier and the RAID1 entry check would become
unnecessary.
However, there are also cases where max_sectors is reduced later within
raid1_write_request(), for example when bad blocks are present on some
mirrors (or due to other RAID1-specific constraints such as write-behind
limits). Those reductions depend on RAID1 runtime state and mirror
health, so they are not readily visible to the block layer during atomic
write validation. In those cases RAID1 still needs to detect that the
atomic write can no longer be serviced as requested and fail it
appropriately.
>
>> Along the way, the bio may instead
>> fail due to other conditions and return a different status, even though
>> the request was invalid as an atomic write from the beginning.
>>
>> Additionally, an otherwise valid atomic write may later require
>> splitting because bad blocks reduce the writable range or because
>> write-behind constraints reduce the maximum writable size. In these
>> cases, the bio currently completes with either EINVAL or ENOTSUPP,
>> whereas it should complete with EIO instead.
>>
>> Fixes: f2a38abf5f1c ("md/raid1: Atomic write support")
>> Fixes: a4c55c902670 ("md/raid1: simplify raid1_write_request() error handling")
>> Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
>> ---
>> drivers/md/raid1.c | 25 +++++++++++--------------
>> 1 file changed, 11 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 86d4f224ffb1..8386d37343a4 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1511,9 +1511,15 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
>> int first_clone;
>> bool write_behind = false;
>> bool nowait = bio->bi_opf & REQ_NOWAIT;
>> + bool atomic = bio->bi_opf & REQ_ATOMIC;
>> bool is_discard = op_is_discard(bio->bi_opf);
>> sector_t sector = bio->bi_iter.bi_sector;
>>
>> + if (atomic && max_sectors != bio_sectors(bio)) {
>> + bio_endio_status(bio, BLK_STS_INVAL);
>> + return false;
>> + }
>> +
>> if (mddev_is_clustered(mddev) &&
>> mddev->cluster_ops->area_resyncing(mddev, WRITE, sector,
>> bio_end_sector(bio))) {
>> @@ -1592,20 +1598,6 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
>> }
>> if (is_bad) {
>> int good_sectors;
>> -
>> - /*
>> - * We cannot atomically write this, so just
>> - * error in that case. It could be possible to
>> - * atomically write other mirrors, but the
>> - * complexity of supporting that is not worth
>> - * the benefit.
>> - */
>> - if (bio->bi_opf & REQ_ATOMIC) {
>> - bio->bi_status = BLK_STS_NOTSUPP;
>
> what baseline are you using here? This looks different to linux-next 22
> june and linus' master branch
>
I'm basing this series on Song's md tree, specifically the md-7.2
branch.
>> - bio_endio(bio);
>> - goto err_dec_pending;
>> - }
>> -
>> good_sectors = first_bad - sector;
>> if (good_sectors < max_sectors)
>> max_sectors = good_sectors;
>> @@ -1626,6 +1618,11 @@ static bool raid1_write_request(struct mddev *mddev, struct bio *bio,
>> max_sectors = min_t(int, max_sectors,
>> BIO_MAX_VECS * (PAGE_SIZE >> 9));
>> if (max_sectors < bio_sectors(bio)) {
>> + if (atomic) {
>> + bio_io_error(bio);
>> + goto err_dec_pending;
>> + }
>> +
>> bio = bio_submit_split_bioset(bio, max_sectors,
>> &conf->bio_split);
>> if (!bio)
>
--
Best Regards,
Abd-Alrhman
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 8:58 ` Abd-Alrhman Masalkhi
@ 2026-06-23 9:20 ` John Garry
2026-06-23 10:06 ` Abd-Alrhman Masalkhi
0 siblings, 1 reply; 13+ messages in thread
From: John Garry @ 2026-06-23 9:20 UTC (permalink / raw)
To: Abd-Alrhman Masalkhi, song, yukuai, magiclinan, xiao, axboe,
martin.petersen
Cc: linux-raid, linux-kernel
On 23/06/2026 09:58, Abd-Alrhman Masalkhi wrote:
> On Tue, Jun 23, 2026 at 09:11 +0100, John Garry wrote:
>> On 23/06/2026 08:24, Abd-Alrhman Masalkhi wrote:
>>> If a request already requires splitting when entering
>>> raid1_write_request(), the current code allows it to proceed until it
>>> eventually reaches the split path.
>>
>> The block layer should catch invalid atomic writes in
>> submit_bio_noacct() -> blk_validate_atomic_write_op_size() before we
>> even get as far as the md atomic write handling. Having the check in
>> bio_submit_split_bioset() is really just a fail-safe for the block layer
>> not catching invalid atomic writes or the atomic writes queue limits not
>> being properly calculated.
> The request size itself satisfies the currently advertised atomic write
> limits, so blk_validate_atomic_write_op_size() allows it. The problem
> is that RAID1 may further restrict atomic writes to a single barrier
> unit via align_to_barrier_unit_end(). Therefore a request that crosses
> a barrier-unit boundary can still reach raid1_write_request() with
> max_sectors < bio_sectors(bio).
>
> If the barrier-unit restriction should instead be advertised through the
> atomic write queue limits,
It should. Any restrictions should be advertised up front. If the user
issues an atomic write which is valid according to the limits, then it
should succeed.
> then I agree the block layer could reject
> such requests earlier and the RAID1 entry check would become
> unnecessary.
>
> However, there are also cases where max_sectors is reduced later within
> raid1_write_request(), for example when bad blocks are present on some
> mirrors (or due to other RAID1-specific constraints such as write-behind
> limits). Those reductions depend on RAID1 runtime state and mirror
> health, so they are not readily visible to the block layer during atomic
> write validation. In those cases RAID1 still needs to detect that the
> atomic write can no longer be serviced as requested and fail it
> appropriately.
Sure, and we do this. As I remember, we should return -EIO in this case.
>
>>
>>> Along the way, the bio may instead
>>> fail due to other conditions and return a different status, even though
>>> the request was invalid as an atomic write from the beginning.
>>>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 9:20 ` John Garry
@ 2026-06-23 10:06 ` Abd-Alrhman Masalkhi
2026-06-23 11:38 ` John Garry
0 siblings, 1 reply; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 10:06 UTC (permalink / raw)
To: John Garry, song, yukuai, magiclinan, xiao, axboe,
martin.petersen
Cc: linux-raid, linux-kernel
On Tue, Jun 23, 2026 at 10:20 +0100, John Garry wrote:
> On 23/06/2026 09:58, Abd-Alrhman Masalkhi wrote:
>> On Tue, Jun 23, 2026 at 09:11 +0100, John Garry wrote:
>>> On 23/06/2026 08:24, Abd-Alrhman Masalkhi wrote:
>>>> If a request already requires splitting when entering
>>>> raid1_write_request(), the current code allows it to proceed until it
>>>> eventually reaches the split path.
>>>
>>> The block layer should catch invalid atomic writes in
>>> submit_bio_noacct() -> blk_validate_atomic_write_op_size() before we
>>> even get as far as the md atomic write handling. Having the check in
>>> bio_submit_split_bioset() is really just a fail-safe for the block layer
>>> not catching invalid atomic writes or the atomic writes queue limits not
>>> being properly calculated.
>> The request size itself satisfies the currently advertised atomic write
>> limits, so blk_validate_atomic_write_op_size() allows it. The problem
>> is that RAID1 may further restrict atomic writes to a single barrier
>> unit via align_to_barrier_unit_end(). Therefore a request that crosses
>> a barrier-unit boundary can still reach raid1_write_request() with
>> max_sectors < bio_sectors(bio).
>>
>> If the barrier-unit restriction should instead be advertised through the
>> atomic write queue limits,
>
> It should. Any restrictions should be advertised up front. For the user
> to issue an atomic write which is valid according to limits, then it
> should succeed.
>
I'll take a look at how best to expose that through the queue limits and
rework this part accordingly. If there is already an existing mechanism
you had in mind, I'd appreciate any pointers.
>> then I agree the block layer could reject
>> such requests earlier and the RAID1 entry check would become
>> unnecessary.
>>
>> However, there are also cases where max_sectors is reduced later within
>> raid1_write_request(), for example when bad blocks are present on some
>> mirrors (or due to other RAID1-specific constraints such as write-behind
>> limits). Those reductions depend on RAID1 runtime state and mirror
>> health, so they are not readily visible to the block layer during atomic
>> write validation. In those cases RAID1 still needs to detect that the
>> atomic write can no longer be serviced as requested and fail it
>> appropriately.
>
> Sure, and we do this. As I remember, we should return -EIO in this case.
>
Right, and that's the main motivation for this patch. The original
atomic write support already returned -EIO for one bad-block path, but
there are other cases where max_sectors can be reduced (e.g. the
first_bad <= sector path and write-behind limits).
After a4c55c902670, those cases can end up completing with EINVAL or
NOTSUPP instead. This patch is intended to restore consistent -EIO.
>>
>>>
>>>> Along the way, the bio may instead
>>>> fail due to other conditions and return a different status, even though
>>>> the request was invalid as an atomic write from the beginning.
>>>>
>
--
Best Regards,
Abd-Alrhman
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/7] md/raid1: handle atomic writes that require splitting
2026-06-23 10:06 ` Abd-Alrhman Masalkhi
@ 2026-06-23 11:38 ` John Garry
0 siblings, 0 replies; 13+ messages in thread
From: John Garry @ 2026-06-23 11:38 UTC (permalink / raw)
To: Abd-Alrhman Masalkhi, song, yukuai, magiclinan, xiao, axboe,
martin.petersen
Cc: linux-raid, linux-kernel
On 23/06/2026 11:06, Abd-Alrhman Masalkhi wrote:
> On Tue, Jun 23, 2026 at 10:20 +0100, John Garry wrote:
>> On 23/06/2026 09:58, Abd-Alrhman Masalkhi wrote:
>>> On Tue, Jun 23, 2026 at 09:11 +0100, John Garry wrote:
>>>> On 23/06/2026 08:24, Abd-Alrhman Masalkhi wrote:
>>>>> If a request already requires splitting when entering
>>>>> raid1_write_request(), the current code allows it to proceed until it
>>>>> eventually reaches the split path.
>>>>
>>>> The block layer should catch invalid atomic writes in
>>>> submit_bio_noacct() -> blk_validate_atomic_write_op_size() before we
>>>> even get as far as the md atomic write handling. Having the check in
>>>> bio_submit_split_bioset() is really just a fail-safe for the block layer
>>>> not catching invalid atomic writes or the atomic writes queue limits not
>>>> being properly calculated.
>>> The request size itself satisfies the currently advertised atomic write
>>> limits, so blk_validate_atomic_write_op_size() allows it. The problem
>>> is that RAID1 may further restrict atomic writes to a single barrier
>>> unit via align_to_barrier_unit_end(). Therefore a request that crosses
>>> a barrier-unit boundary can still reach raid1_write_request() with
>>> max_sectors < bio_sectors(bio).
>>>
>>> If the barrier-unit restriction should instead be advertised through the
>>> atomic write queue limits,
>>
>> It should. Any restrictions should be advertised up front. For the user
>> to issue an atomic write which is valid according to limits, then it
>> should succeed.
>>
>
> I'll take a look at how best to expose that through the queue limits and
> rework this part accordingly. If there is already an existing mechanism
> you had in mind, I'd appreciate any pointers.
Any write must fit within BARRIER_UNIT_SECTOR_SIZE, right?
Since an atomic write must be naturally aligned, then I would expect
that the atomic write max unit is limited by BARRIER_UNIT_SECTOR_SIZE.
>
>>> then I agree the block layer could reject
>>> such requests earlier and the RAID1 entry check would become
>>> unnecessary.
>>>
>>> However, there are also cases where max_sectors is reduced later within
>>> raid1_write_request(), for example when bad blocks are present on some
>>> mirrors (or due to other RAID1-specific constraints such as write-behind
>>> limits). Those reductions depend on RAID1 runtime state and mirror
>>> health, so they are not readily visible to the block layer during atomic
>>> write validation. In those cases RAID1 still needs to detect that the
>>> atomic write can no longer be serviced as requested and fail it
>>> appropriately.
>>
>> Sure, and we do this. As I remember, we should return -EIO in this case.
>>
>
> Right, and that's the main motivation for this patch. The original
> atomic write support already returned -EIO for one bad-block path, but
> there are other cases where max_sectors can be reduced (e.g. the
> first_bad <= sector path and write-behind limits)
>
> After a4c55c902670, those cases can end up completing with EINVAL or
> NOTSUPP instead. This patch is intended to restore consistent -EIO.
>
ok, but I could not check this as I did not recognize the baseline code.
>>>
>>>>
>>>>> Along the way, the bio may instead
>>>>> fail due to other conditions and return a different status, even though
>>>>> the request was invalid as an atomic write from the beginning.
>>>>>
>>
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 3/7] md/raid10: handle atomic writes that require splitting
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 1/7] md/raid10: fix r10bio leak in raid10_write_request() error paths Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 2/7] md/raid1: handle atomic writes that require splitting Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 4/7] md/raid10: raid10_write_request() drops the barrier before calling Abd-Alrhman Masalkhi
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
If a request already requires splitting when entering
raid10_write_request(), the current code allows it to proceed until it
eventually reaches the split path. Along the way, the bio may instead
fail due to other conditions and return a different status, even though
the request was invalid as an atomic write from the beginning.
Additionally, an otherwise valid atomic write may later require
splitting because bad blocks reduce the writable range. In this case,
the bio currently completes with either EINVAL or EIO, whereas it should
complete with EIO consistently.
Fixes: a1d9b4fd42d9 ("md/raid10: Atomic write support")
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index bd322eccdc3f..840f0446c231 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1356,6 +1356,13 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
int i, k;
sector_t sectors;
int max_sectors;
+ bool atomic = bio->bi_opf & REQ_ATOMIC;
+
+ if (atomic && r10_bio->sectors != bio_sectors(bio)) {
+ bio_endio_status(bio, BLK_STS_INVAL);
+ free_r10bio(r10_bio);
+ return false;
+ }
if ((mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE,
@@ -1464,16 +1471,6 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
if (is_bad) {
int good_sectors;
- /*
- * We cannot atomically write this, so just
- * error in that case. It could be possible to
- * atomically write other mirrors, but the
- * complexity of supporting that is not worth
- * the benefit.
- */
- if (bio->bi_opf & REQ_ATOMIC)
- goto err_handle;
-
good_sectors = first_bad - dev_sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
@@ -1493,6 +1490,9 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->sectors = max_sectors;
if (r10_bio->sectors < bio_sectors(bio)) {
+ if (atomic)
+ goto err_handle;
+
allow_barrier(conf);
bio = bio_submit_split_bioset(bio, r10_bio->sectors,
&conf->bio_split);
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 4/7] md/raid10: raid10_write_request() drops the barrier before calling
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
` (2 preceding siblings ...)
2026-06-23 7:24 ` [PATCH 3/7] md/raid10: " Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 5/7] md/raid10: replace wait loop with wait_event_idle() Abd-Alrhman Masalkhi
` (2 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
raid10_write_request() drops the barrier before calling
bio_submit_split_bioset() and reacquires it afterwards. This is
unnecessary because bio_submit_split_bioset() does not require
releasing the barrier protection.
Remove the redundant allow_barrier()/wait_barrier() pair around
bio_submit_split_bioset().
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 840f0446c231..4bc1d5553ec7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1493,10 +1493,8 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
if (atomic)
goto err_handle;
- allow_barrier(conf);
bio = bio_submit_split_bioset(bio, r10_bio->sectors,
&conf->bio_split);
- wait_barrier(conf, false);
if (!bio) {
set_bit(R10BIO_Returned, &r10_bio->state);
goto err_handle;
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 5/7] md/raid10: replace wait loop with wait_event_idle()
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
` (3 preceding siblings ...)
2026-06-23 7:24 ` [PATCH 4/7] md/raid10: raid10_write_request() drops the barrier before calling Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 6/7] md/raid10: simplify write request error handling Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 7/7] md/raid10: simplify read " Abd-Alrhman Masalkhi
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
The wait loop is equivalent to wait_event_idle(), so replace it with
wait_event_idle() to improve readability.
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4bc1d5553ec7..7085fd97b98a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1368,22 +1368,17 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))) {
- DEFINE_WAIT(w);
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
free_r10bio(r10_bio);
return false;
}
- for (;;) {
- prepare_to_wait(&conf->wait_barrier,
- &w, TASK_IDLE);
- if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector, bio_end_sector(bio)))
- break;
- schedule();
- }
- finish_wait(&conf->wait_barrier, &w);
+
+ wait_event_idle(conf->wait_barrier,
+ !mddev->cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)));
}
sectors = r10_bio->sectors;
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 6/7] md/raid10: simplify write request error handling
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
` (4 preceding siblings ...)
2026-06-23 7:24 ` [PATCH 5/7] md/raid10: replace wait loop with wait_event_idle() Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
2026-06-23 7:24 ` [PATCH 7/7] md/raid10: simplify read " Abd-Alrhman Masalkhi
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
raid10_write_request() currently handles bio completion, barrier
handling, and r10_bio lifetime management in several different error
paths. This results in duplicated cleanup logic and increases the risk
of introducing bugs in future modifications.
Move bio_wouldblock_error() handling to the callers of
regular_request_wait(), consolidate the write error paths, and free
r10_bio from a single location in __make_request() when
raid10_write_request() fails.
Also remove the redundant local copies of r10_bio->sectors and use a
single max_sectors variable throughout the function.
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 61 +++++++++++++++++++++------------------------
1 file changed, 29 insertions(+), 32 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7085fd97b98a..2de898733337 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1123,18 +1123,16 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
struct bio *bio, sector_t sectors)
{
/* Bail out if REQ_NOWAIT is set for the bio */
- if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
- bio_wouldblock_error(bio);
+ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT))
return false;
- }
+
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
allow_barrier(conf);
- if (bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
+ if (bio->bi_opf & REQ_NOWAIT)
return false;
- }
+
mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
@@ -1192,6 +1190,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
+ bio_wouldblock_error(bio);
free_r10bio(r10_bio);
return;
}
@@ -1354,13 +1353,12 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
{
struct r10conf *conf = mddev->private;
int i, k;
- sector_t sectors;
- int max_sectors;
+ int max_sectors = r10_bio->sectors;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
bool atomic = bio->bi_opf & REQ_ATOMIC;
- if (atomic && r10_bio->sectors != bio_sectors(bio)) {
+ if (atomic && max_sectors != bio_sectors(bio)) {
bio_endio_status(bio, BLK_STS_INVAL);
- free_r10bio(r10_bio);
return false;
}
@@ -1369,9 +1367,8 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))) {
/* Bail out if REQ_NOWAIT is set for the bio */
- if (bio->bi_opf & REQ_NOWAIT) {
+ if (nowait) {
bio_wouldblock_error(bio);
- free_r10bio(r10_bio);
return false;
}
@@ -1381,28 +1378,25 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
bio_end_sector(bio)));
}
- sectors = r10_bio->sectors;
- if (!regular_request_wait(mddev, conf, bio, sectors)) {
- free_r10bio(r10_bio);
+ if (!regular_request_wait(mddev, conf, bio, max_sectors)) {
+ bio_wouldblock_error(bio);
return false;
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
- bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
- : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+ bio->bi_iter.bi_sector + max_sectors > conf->reshape_progress)
+ : (bio->bi_iter.bi_sector + max_sectors > conf->reshape_safe &&
bio->bi_iter.bi_sector < conf->reshape_progress))) {
/* Need to update reshape_position in metadata */
mddev->reshape_position = conf->reshape_progress;
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
md_wakeup_thread(mddev->thread);
- if (bio->bi_opf & REQ_NOWAIT) {
- allow_barrier(conf);
+ if (nowait) {
bio_wouldblock_error(bio);
- free_r10bio(r10_bio);
- return false;
+ goto err_allow_barrier;
}
mddev_add_trace_msg(conf->mddev,
"raid10 wait reshape metadata");
@@ -1427,8 +1421,6 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
wait_blocked_dev(mddev, r10_bio);
- max_sectors = r10_bio->sectors;
-
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev;
@@ -1485,15 +1477,15 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->sectors = max_sectors;
if (r10_bio->sectors < bio_sectors(bio)) {
- if (atomic)
- goto err_handle;
+ if (atomic) {
+ bio_io_error(bio);
+ goto err_dec_pending;
+ }
bio = bio_submit_split_bioset(bio, r10_bio->sectors,
&conf->bio_split);
- if (!bio) {
- set_bit(R10BIO_Returned, &r10_bio->state);
- goto err_handle;
- }
+ if (!bio)
+ goto err_dec_pending;
r10_bio->master_bio = bio;
}
@@ -1511,7 +1503,7 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
one_write_done(r10_bio);
return true;
-err_handle:
+err_dec_pending:
for (k = 0; k < i; k++) {
int d = r10_bio->devs[k].devnum;
struct md_rdev *rdev = conf->mirrors[d].rdev;
@@ -1527,7 +1519,9 @@ static bool raid10_write_request(struct mddev *mddev, struct bio *bio,
}
}
- raid_end_bio_io(r10_bio);
+err_allow_barrier:
+ allow_barrier(conf);
+
return false;
}
@@ -1552,8 +1546,11 @@ static bool __make_request(struct mddev *mddev, struct bio *bio, int sectors)
ret = true;
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
- else
+ else {
ret = raid10_write_request(mddev, bio, r10_bio);
+ if (!ret)
+ free_r10bio(r10_bio);
+ }
return ret;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 7/7] md/raid10: simplify read request error handling
2026-06-23 7:24 [PATCH 0/7] md/raid10: fixes, atomic write handling, and error-path cleanup Abd-Alrhman Masalkhi
` (5 preceding siblings ...)
2026-06-23 7:24 ` [PATCH 6/7] md/raid10: simplify write request error handling Abd-Alrhman Masalkhi
@ 2026-06-23 7:24 ` Abd-Alrhman Masalkhi
6 siblings, 0 replies; 13+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-23 7:24 UTC (permalink / raw)
To: song, yukuai, magiclinan, xiao, axboe, john.g.garry,
martin.petersen, abd.masalkhi
Cc: linux-raid, linux-kernel
raid10_read_request() currently handles bio completion, barrier
handling, and r10_bio lifetime management in several different error
paths. This results in duplicated cleanup logic and increases the risk
of introducing bugs in future modifications.
Make raid10_read_request() return a status to its callers, consolidate
the read error paths, and free r10_bio from a single location in the
callers. Since the callers allocate r10_bio, they should also be
responsible for freeing it when the request fails.
This makes the read path follow the same ownership model as the write
path and simplifies the error handling flow.
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
drivers/md/raid10.c | 45 +++++++++++++++++++++++++--------------------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2de898733337..830c0fe30b96 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1143,7 +1143,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
return true;
}
-static void raid10_read_request(struct mddev *mddev, struct bio *bio,
+static bool raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
@@ -1191,8 +1191,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
bio_wouldblock_error(bio);
- free_r10bio(r10_bio);
- return;
+ return false;
}
rdev = read_balance(conf, r10_bio, &max_sectors);
@@ -1202,8 +1201,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
mdname(mddev), b,
(unsigned long long)r10_bio->sector);
}
- raid_end_bio_io(r10_bio);
- return;
+ bio_io_error(bio);
+ goto err_allow_barrier;
}
if (err_rdev)
pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n",
@@ -1215,10 +1214,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
bio = bio_submit_split_bioset(bio, max_sectors,
&conf->bio_split);
wait_barrier(conf, false);
- if (!bio) {
- set_bit(R10BIO_Returned, &r10_bio->state);
- goto err_handle;
- }
+ if (!bio)
+ goto err_dec_pending;
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
@@ -1244,10 +1241,16 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_private = r10_bio;
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
- return;
-err_handle:
+
+ return true;
+
+err_dec_pending:
atomic_dec(&rdev->nr_pending);
- raid_end_bio_io(r10_bio);
+
+err_allow_barrier:
+ allow_barrier(conf);
+
+ return false;
}
static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
@@ -1543,14 +1546,13 @@ static bool __make_request(struct mddev *mddev, struct bio *bio, int sectors)
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
- ret = true;
if (bio_data_dir(bio) == READ)
- raid10_read_request(mddev, bio, r10_bio);
- else {
+ ret = raid10_read_request(mddev, bio, r10_bio);
+ else
ret = raid10_write_request(mddev, bio, r10_bio);
- if (!ret)
- free_r10bio(r10_bio);
- }
+
+ if (!ret)
+ free_r10bio(r10_bio);
return ret;
}
@@ -1880,6 +1882,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
int sectors = bio_sectors(bio);
+ bool write = bio_data_dir(bio) == WRITE;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
&& md_flush_request(mddev, bio))
@@ -1903,7 +1906,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
sectors = chunk_sects -
(bio->bi_iter.bi_sector &
(chunk_sects - 1));
- if (!__make_request(mddev, bio, sectors))
+ if (!__make_request(mddev, bio, sectors) && write)
md_write_end(mddev);
/* In case raid10d snuck in to freeze_array */
@@ -2871,7 +2874,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
rdev_dec_pending(rdev, mddev);
r10_bio->state = 0;
- raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+ if (!raid10_read_request(mddev, r10_bio->master_bio, r10_bio))
+ free_r10bio(r10_bio);
+
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
--
2.43.0
^ permalink raw reply related [flat|nested] 13+ messages in thread