[PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes
@ 2026-06-11  8:35 Abd-Alrhman Masalkhi
  2026-06-11  8:49 ` sashiko-bot
  2026-06-20 20:18 ` Yu Kuai
  0 siblings, 2 replies; 5+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-11  8:35 UTC (permalink / raw)
  To: song, yukuai, magiclinan, xiao, vverma, axboe
  Cc: linux-raid, linux-kernel, Abd-Alrhman Masalkhi

raid1 supports REQ_NOWAIT reads by avoiding waits in the barrier path
through wait_read_barrier(). However, a read can still block on a
WriteMostly device when the array uses a bitmap and there are
outstanding behind writes.

In that case raid1 unconditionally calls wait_behind_writes(), which
may sleep until all behind writes complete. As a result, a REQ_NOWAIT
read can block despite the caller explicitly requesting non-blocking
behavior.

This ensures that raid1 consistently honors REQ_NOWAIT reads across all
paths that may otherwise wait for behind writes.

Fixes: 5aa705039c4f ("md: raid1 add nowait support")
Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
---
 drivers/md/md-bitmap.c   |  9 +++++++--
 drivers/md/md-bitmap.h   |  2 +-
 drivers/md/md-llbitmap.c | 13 ++++++++-----
 drivers/md/md.c          |  2 +-
 drivers/md/raid1.c       | 10 +++++++---
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 028b9ca8ce52..1206e31f323a 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2063,18 +2063,23 @@ static void bitmap_end_behind_write(struct mddev *mddev)
 		 bitmap->mddev->bitmap_info.max_write_behind);
 }
 
-static void bitmap_wait_behind_writes(struct mddev *mddev)
+static bool bitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
 {
 	struct bitmap *bitmap = mddev->bitmap;
 
 	/* wait for behind writes to complete */
 	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		if (nowait)
+			return false;
+
 		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
 			 mdname(mddev));
 		/* need to kick something here to make sure I/O goes? */
 		wait_event(bitmap->behind_wait,
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
+
+	return true;
 }
 
 static void bitmap_destroy(struct mddev *mddev)
@@ -2084,7 +2089,7 @@ static void bitmap_destroy(struct mddev *mddev)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
-	bitmap_wait_behind_writes(mddev);
+	bitmap_wait_behind_writes(mddev, false);
 	if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
 		mddev_destroy_serial_pool(mddev, NULL);
 
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 214f623c7e79..f46674bdfeb9 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -98,7 +98,7 @@ struct bitmap_operations {
 
 	void (*start_behind_write)(struct mddev *mddev);
 	void (*end_behind_write)(struct mddev *mddev);
-	void (*wait_behind_writes)(struct mddev *mddev);
+	bool (*wait_behind_writes)(struct mddev *mddev, bool nowait);
 
 	md_bitmap_fn *start_write;
 	md_bitmap_fn *end_write;
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
index 1adc5b117821..5a4e2abaa757 100644
--- a/drivers/md/md-llbitmap.c
+++ b/drivers/md/md-llbitmap.c
@@ -1574,16 +1574,19 @@ static void llbitmap_end_behind_write(struct mddev *mddev)
 		wake_up(&llbitmap->behind_wait);
 }
 
-static void llbitmap_wait_behind_writes(struct mddev *mddev)
+static bool llbitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
 {
 	struct llbitmap *llbitmap = mddev->bitmap;
 
-	if (!llbitmap)
-		return;
+	if (llbitmap && atomic_read(&llbitmap->behind_writes) > 0) {
+		if (nowait)
+			return false;
 
-	wait_event(llbitmap->behind_wait,
-		   atomic_read(&llbitmap->behind_writes) == 0);
+		wait_event(llbitmap->behind_wait,
+			   atomic_read(&llbitmap->behind_writes) == 0);
+	}
 
+	return true;
 }
 
 static ssize_t bits_show(struct mddev *mddev, char *page)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 096bb64e87bd..d1465bcd86c8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7050,7 +7050,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
 static void mddev_detach(struct mddev *mddev)
 {
 	if (md_bitmap_enabled(mddev, false))
-		mddev->bitmap_ops->wait_behind_writes(mddev);
+		mddev->bitmap_ops->wait_behind_writes(mddev, false);
 	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b1ed4cc6ade4..b2d7c13b64bd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1341,6 +1341,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	int max_sectors;
 	int rdisk;
 	bool r1bio_existed = !!r1_bio;
+	bool nowait = bio->bi_opf & REQ_NOWAIT;
 
 	/*
 	 * An md cloned bio indicates we are in the error path.
@@ -1360,8 +1361,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
 	 */
-	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
-				bio->bi_opf & REQ_NOWAIT)) {
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, nowait)) {
 		bio_wouldblock_error(bio);
 		return;
 	}
@@ -1402,7 +1402,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 		 * over-take any writes that are 'behind'
 		 */
 		mddev_add_trace_msg(mddev, "raid1 wait behind writes");
-		mddev->bitmap_ops->wait_behind_writes(mddev);
+		if (!mddev->bitmap_ops->wait_behind_writes(mddev, nowait)) {
+			bio_wouldblock_error(bio);
+			set_bit(R1BIO_Returned, &r1_bio->state);
+			goto err_handle;
+		}
 	}
 
 	if (max_sectors < bio_sectors(bio)) {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes
  2026-06-11  8:35 [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes Abd-Alrhman Masalkhi
@ 2026-06-11  8:49 ` sashiko-bot
  2026-06-11 10:49   ` Abd-Alrhman Masalkhi
  2026-06-20 20:18 ` Yu Kuai
  1 sibling, 1 reply; 5+ messages in thread
From: sashiko-bot @ 2026-06-11  8:49 UTC (permalink / raw)
  To: Abd-Alrhman Masalkhi; +Cc: yukuai, linux-raid

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:

Pre-existing issues:
- [High] Leaking pre-allocated `r1_bio` when `REQ_NOWAIT` reads fail `wait_read_barrier()` during retry.
- [High] Permanent barrier leak if a REQ_NOWAIT write encounters a blocked device.
- [Medium] REQ_NOWAIT semantics violation: memory allocations use GFP_NOIO unconditionally.
--

commit 7fed7389f4fe20e79eece974eaf3b8d317519e27
Author: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>

md/raid1: honor REQ_NOWAIT when waiting for behind writes

This commit ensures that raid1 consistently honors REQ_NOWAIT reads
across paths that may otherwise wait for behind writes. It modifies
bitmap_wait_behind_writes() to return early instead of sleeping if
the nowait flag is set.

> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 5b9368bd9e700..d03da046a5920 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
[ ... ]
> @@ -1360,8 +1361,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>  	 * Still need barrier for READ in case that whole
>  	 * array is frozen.
>  	 */
> -	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
> -				bio->bi_opf & REQ_NOWAIT)) {
> +	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, nowait)) {
>  		bio_wouldblock_error(bio);
>  		return;
>  	}

[Severity: High]
This is a pre-existing issue, but does this early return leak the 
pre-allocated r1_bio when wait_read_barrier() fails during a retry?

When a REQ_NOWAIT read fails its initial attempt, it is queued to the 
retry_list and handled by the raid1d thread via handle_read_error().

That function passes the existing, pre-allocated r1_bio back to 
raid1_read_request():

drivers/md/raid1.c:handle_read_error() {
    ...
    raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
    allow_barrier(conf, sector);
    ...
}

If the array is concurrently frozen, wait_read_barrier() returns false 
since REQ_NOWAIT is still set on the master bio. The code handles this 
by returning immediately, bypassing the err_handle cleanup path and 
failing to call raid_end_bio_io().

Since handle_read_error() assumes ownership was taken and only releases 
its old barrier, does this result in a permanent memory leak of the 
r1_bio from the r1bio_pool mempool?

[Severity: Medium]
This is another pre-existing issue, but does the allocation path for 
REQ_NOWAIT bios violate block layer non-blocking semantics by 
unconditionally using GFP_NOIO?

Even when REQ_NOWAIT is requested, alloc_r1bio() uses GFP_NOIO 
unconditionally:

drivers/md/raid1.c:alloc_r1bio() {
    ...
    r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
    ...
}

Similarly, bio_alloc_clone() in raid1_read_request() unconditionally 
uses gfp flags that include __GFP_DIRECT_RECLAIM.

Since mempool-backed allocations will sleep under memory pressure 
instead of failing immediately when these flags are used, will this 
cause unexpected high latencies and potential priority inversion in 
asynchronous contexts?

[Severity: High]
This isn't a bug introduced by this patch, but does 
raid1_write_request() permanently leak the barrier if a REQ_NOWAIT 
write encounters a blocked device?

In raid1_write_request(), the code successfully acquires a barrier by 
calling wait_barrier(), which internally increments conf->nr_pending.

Immediately after, it calls wait_blocked_rdev():

drivers/md/raid1.c:raid1_write_request() {
    ...
    if (!wait_blocked_rdev(mddev, bio)) {
        bio_wouldblock_error(bio);
        return;
    }
    ...
}

If the device is blocked and the request is REQ_NOWAIT, 
wait_blocked_rdev() returns false and the function returns early. 
However, it fails to call allow_barrier() before returning to drop 
the previously acquired barrier increment.

Because freeze_array() waits for nr_pending to reach 0, could this 
permanently prevent the array from ever freezing again, deadlocking 
resyncs, reconfiguration, and array teardown?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260611083514.754922-1-abd.masalkhi@gmail.com?part=1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes
  2026-06-11  8:49 ` sashiko-bot
@ 2026-06-11 10:49   ` Abd-Alrhman Masalkhi
  0 siblings, 0 replies; 5+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-11 10:49 UTC (permalink / raw)
  To: sashiko-reviews; +Cc: yukuai, linux-raid

On Thu, Jun 11, 2026 at 08:49 +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
>
> Pre-existing issues:
> - [High] Leaking pre-allocated `r1_bio` when `REQ_NOWAIT` reads fail `wait_read_barrier()` during retry.
> - [High] Permanent barrier leak if a REQ_NOWAIT write encounters a blocked device.
> - [Medium] REQ_NOWAIT semantics violation: memory allocations use GFP_NOIO unconditionally.

These appear to be pre-existing issues rather than being introduced by
this patch. I plan to address them in separate patches.

> --
>
> commit 7fed7389f4fe20e79eece974eaf3b8d317519e27
> Author: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
>
> md/raid1: honor REQ_NOWAIT when waiting for behind writes
>
> This commit ensures that raid1 consistently honors REQ_NOWAIT reads
> across paths that may otherwise wait for behind writes. It modifies
> bitmap_wait_behind_writes() to return early instead of sleeping if
> the nowait flag is set.
>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 5b9368bd9e700..d03da046a5920 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
> [ ... ]
>> @@ -1360,8 +1361,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>  	 * Still need barrier for READ in case that whole
>>  	 * array is frozen.
>>  	 */
>> -	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>> -				bio->bi_opf & REQ_NOWAIT)) {
>> +	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, nowait)) {
>>  		bio_wouldblock_error(bio);
>>  		return;
>>  	}
>
> [Severity: High]
> This is a pre-existing issue, but does this early return leak the 
> pre-allocated r1_bio when wait_read_barrier() fails during a retry?
>
> When a REQ_NOWAIT read fails its initial attempt, it is queued to the 
> retry_list and handled by the raid1d thread via handle_read_error().
>
> That function passes the existing, pre-allocated r1_bio back to 
> raid1_read_request():
>
> drivers/md/raid1.c:handle_read_error() {
>     ...
>     raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
>     allow_barrier(conf, sector);
>     ...
> }
>
> If the array is concurrently frozen, wait_read_barrier() returns false 
> since REQ_NOWAIT is still set on the master bio. The code handles this 
> by returning immediately, bypassing the err_handle cleanup path and 
> failing to call raid_end_bio_io().
>
> Since handle_read_error() assumes ownership was taken and only releases 
> its old barrier, does this result in a permanent memory leak of the 
> r1_bio from the r1bio_pool mempool?
>
> [Severity: Medium]
> This is another pre-existing issue, but does the allocation path for 
> REQ_NOWAIT bios violate block layer non-blocking semantics by 
> unconditionally using GFP_NOIO?
>
> Even when REQ_NOWAIT is requested, alloc_r1bio() uses GFP_NOIO 
> unconditionally:
>
> drivers/md/raid1.c:alloc_r1bio() {
>     ...
>     r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
>     ...
> }
>
> Similarly, bio_alloc_clone() in raid1_read_request() unconditionally 
> uses gfp flags that include __GFP_DIRECT_RECLAIM.
>
> Since mempool-backed allocations will sleep under memory pressure 
> instead of failing immediately when these flags are used, will this 
> cause unexpected high latencies and potential priority inversion in 
> asynchronous contexts?
>
> [Severity: High]
> This isn't a bug introduced by this patch, but does 
> raid1_write_request() permanently leak the barrier if a REQ_NOWAIT 
> write encounters a blocked device?
>
> In raid1_write_request(), the code successfully acquires a barrier by 
> calling wait_barrier(), which internally increments conf->nr_pending.
>
> Immediately after, it calls wait_blocked_rdev():
>
> drivers/md/raid1.c:raid1_write_request() {
>     ...
>     if (!wait_blocked_rdev(mddev, bio)) {
>         bio_wouldblock_error(bio);
>         return;
>     }
>     ...
> }
>
> If the device is blocked and the request is REQ_NOWAIT, 
> wait_blocked_rdev() returns false and the function returns early. 
> However, it fails to call allow_barrier() before returning to drop 
> the previously acquired barrier increment.
>
> Because freeze_array() waits for nr_pending to reach 0, could this 
> permanently prevent the array from ever freezing again, deadlocking 
> resyncs, reconfiguration, and array teardown?
>
> -- 
> Sashiko AI review · https://sashiko.dev/#/patchset/20260611083514.754922-1-abd.masalkhi@gmail.com?part=1

-- 
Best Regards,
Abd-Alrhman

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes
  2026-06-11  8:35 [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes Abd-Alrhman Masalkhi
  2026-06-11  8:49 ` sashiko-bot
@ 2026-06-20 20:18 ` Yu Kuai
  2026-06-21 18:08   ` Abd-Alrhman Masalkhi
  1 sibling, 1 reply; 5+ messages in thread
From: Yu Kuai @ 2026-06-20 20:18 UTC (permalink / raw)
  To: Abd-Alrhman Masalkhi, song, yukuai, magiclinan, xiao, vverma,
	axboe
  Cc: linux-raid, linux-kernel, yukuai

Hi,

在 2026/6/11 16:35, Abd-Alrhman Masalkhi 写道:
> raid1 supports REQ_NOWAIT reads by avoiding waits in the barrier path
> through wait_read_barrier(). However, a read can still block on a
> WriteMostly device when the array uses a bitmap and there are
> outstanding behind writes.
>
> In that case raid1 unconditionally calls wait_behind_writes(), which
> may sleep until all behind writes complete. As a result, a REQ_NOWAIT
> read can block despite the caller explicitly requesting non-blocking
> behavior.
>
> This ensures that raid1 consistently honors REQ_NOWAIT reads across all
> paths that may otherwise wait for behind writes.
>
> Fixes: 5aa705039c4f ("md: raid1 add nowait support")
> Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
> ---
>   drivers/md/md-bitmap.c   |  9 +++++++--
>   drivers/md/md-bitmap.h   |  2 +-
>   drivers/md/md-llbitmap.c | 13 ++++++++-----
>   drivers/md/md.c          |  2 +-
>   drivers/md/raid1.c       | 10 +++++++---
>   5 files changed, 24 insertions(+), 12 deletions(-)

Applied to md-7.2.

One note, remove nowait support is already in my to-do lists. There is a case
that can't be handled for now, for example:

2 disk raid1, issue no wait write IO, rdev1 succeed while rdev2 failed. In this
case, we don't know if rdev2 failed issue because nowait or disk really have
badblocks. So we can't either record badblocks or issue again without nowait,
for consequence, rdev1 and rdev2 will end up with different data.

I think the best solution is to remove nowait support for mdraid. Feel free
to cook a patch or if you have something else in mind.

>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index 028b9ca8ce52..1206e31f323a 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -2063,18 +2063,23 @@ static void bitmap_end_behind_write(struct mddev *mddev)
>   		 bitmap->mddev->bitmap_info.max_write_behind);
>   }
>   
> -static void bitmap_wait_behind_writes(struct mddev *mddev)
> +static bool bitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
>   {
>   	struct bitmap *bitmap = mddev->bitmap;
>   
>   	/* wait for behind writes to complete */
>   	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
> +		if (nowait)
> +			return false;
> +
>   		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
>   			 mdname(mddev));
>   		/* need to kick something here to make sure I/O goes? */
>   		wait_event(bitmap->behind_wait,
>   			   atomic_read(&bitmap->behind_writes) == 0);
>   	}
> +
> +	return true;
>   }
>   
>   static void bitmap_destroy(struct mddev *mddev)
> @@ -2084,7 +2089,7 @@ static void bitmap_destroy(struct mddev *mddev)
>   	if (!bitmap) /* there was no bitmap */
>   		return;
>   
> -	bitmap_wait_behind_writes(mddev);
> +	bitmap_wait_behind_writes(mddev, false);
>   	if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
>   		mddev_destroy_serial_pool(mddev, NULL);
>   
> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> index 214f623c7e79..f46674bdfeb9 100644
> --- a/drivers/md/md-bitmap.h
> +++ b/drivers/md/md-bitmap.h
> @@ -98,7 +98,7 @@ struct bitmap_operations {
>   
>   	void (*start_behind_write)(struct mddev *mddev);
>   	void (*end_behind_write)(struct mddev *mddev);
> -	void (*wait_behind_writes)(struct mddev *mddev);
> +	bool (*wait_behind_writes)(struct mddev *mddev, bool nowait);
>   
>   	md_bitmap_fn *start_write;
>   	md_bitmap_fn *end_write;
> diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
> index 1adc5b117821..5a4e2abaa757 100644
> --- a/drivers/md/md-llbitmap.c
> +++ b/drivers/md/md-llbitmap.c
> @@ -1574,16 +1574,19 @@ static void llbitmap_end_behind_write(struct mddev *mddev)
>   		wake_up(&llbitmap->behind_wait);
>   }
>   
> -static void llbitmap_wait_behind_writes(struct mddev *mddev)
> +static bool llbitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
>   {
>   	struct llbitmap *llbitmap = mddev->bitmap;
>   
> -	if (!llbitmap)
> -		return;
> +	if (llbitmap && atomic_read(&llbitmap->behind_writes) > 0) {
> +		if (nowait)
> +			return false;
>   
> -	wait_event(llbitmap->behind_wait,
> -		   atomic_read(&llbitmap->behind_writes) == 0);
> +		wait_event(llbitmap->behind_wait,
> +			   atomic_read(&llbitmap->behind_writes) == 0);
> +	}
>   
> +	return true;
>   }
>   
>   static ssize_t bits_show(struct mddev *mddev, char *page)
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 096bb64e87bd..d1465bcd86c8 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7050,7 +7050,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>   static void mddev_detach(struct mddev *mddev)
>   {
>   	if (md_bitmap_enabled(mddev, false))
> -		mddev->bitmap_ops->wait_behind_writes(mddev);
> +		mddev->bitmap_ops->wait_behind_writes(mddev, false);
>   	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
>   		mddev->pers->quiesce(mddev, 1);
>   		mddev->pers->quiesce(mddev, 0);
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index b1ed4cc6ade4..b2d7c13b64bd 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1341,6 +1341,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>   	int max_sectors;
>   	int rdisk;
>   	bool r1bio_existed = !!r1_bio;
> +	bool nowait = bio->bi_opf & REQ_NOWAIT;
>   
>   	/*
>   	 * An md cloned bio indicates we are in the error path.
> @@ -1360,8 +1361,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>   	 * Still need barrier for READ in case that whole
>   	 * array is frozen.
>   	 */
> -	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
> -				bio->bi_opf & REQ_NOWAIT)) {
> +	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, nowait)) {
>   		bio_wouldblock_error(bio);
>   		return;
>   	}
> @@ -1402,7 +1402,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>   		 * over-take any writes that are 'behind'
>   		 */
>   		mddev_add_trace_msg(mddev, "raid1 wait behind writes");
> -		mddev->bitmap_ops->wait_behind_writes(mddev);
> +		if (!mddev->bitmap_ops->wait_behind_writes(mddev, nowait)) {
> +			bio_wouldblock_error(bio);
> +			set_bit(R1BIO_Returned, &r1_bio->state);
> +			goto err_handle;
> +		}
>   	}
>   
>   	if (max_sectors < bio_sectors(bio)) {

-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes
  2026-06-20 20:18 ` Yu Kuai
@ 2026-06-21 18:08   ` Abd-Alrhman Masalkhi
  0 siblings, 0 replies; 5+ messages in thread
From: Abd-Alrhman Masalkhi @ 2026-06-21 18:08 UTC (permalink / raw)
  To: Yu Kuai, song, yukuai, magiclinan, xiao, vverma, axboe
  Cc: linux-raid, linux-kernel, yukuai


Hi Kuai,

On Sun, Jun 21, 2026 at 04:18 +0800, Yu Kuai wrote:
> Hi,
>
> 在 2026/6/11 16:35, Abd-Alrhman Masalkhi 写道:
>> raid1 supports REQ_NOWAIT reads by avoiding waits in the barrier path
>> through wait_read_barrier(). However, a read can still block on a
>> WriteMostly device when the array uses a bitmap and there are
>> outstanding behind writes.
>>
>> In that case raid1 unconditionally calls wait_behind_writes(), which
>> may sleep until all behind writes complete. As a result, a REQ_NOWAIT
>> read can block despite the caller explicitly requesting non-blocking
>> behavior.
>>
>> This ensures that raid1 consistently honors REQ_NOWAIT reads across all
>> paths that may otherwise wait for behind writes.
>>
>> Fixes: 5aa705039c4f ("md: raid1 add nowait support")
>> Signed-off-by: Abd-Alrhman Masalkhi <abd.masalkhi@gmail.com>
>> ---
>>   drivers/md/md-bitmap.c   |  9 +++++++--
>>   drivers/md/md-bitmap.h   |  2 +-
>>   drivers/md/md-llbitmap.c | 13 ++++++++-----
>>   drivers/md/md.c          |  2 +-
>>   drivers/md/raid1.c       | 10 +++++++---
>>   5 files changed, 24 insertions(+), 12 deletions(-)
>
> Applied to md-7.2.
>
> One note, remove nowait support is already in my to-do lists. There is a case
> that can't be handled for now, for example:
>
> 2 disk raid1, issue no wait write IO, rdev1 succeed while rdev2 failed. In this
> case, we don't know if rdev2 failed issue because nowait or disk really have
> badblocks. So we can't either record badblocks or issue again without nowait,
> for consequence, rdev1 and rdev2 will end up with different data.
>
> I think the best solution is to remove nowait support for mdraid. Feel free
> to cook a patch or if you have something else in mind.
>

What if on a partial nowait failure we just end the master bio as
success, but set NEEDED on that chunk's bitmap counter and start_sync()
picks it up for resync? That way we don't have to decide why rdev2
failed at all, resync just copies from rdev1 to rdev2 without nowait,
so if it's a real bad block, end_sync_write() records it then.

We kinda have the idea of ending the bio before the write lands on all
mirrors in write-behind already, though it's not quite the same, there
the deferred write still lands, here we ACK after it already failed and
lean on resync to redo it.

My worry is the loaded case: AGAIN under queue pressure isn't that rare,
so we might end up triggering resync frequently. Do you think that's
acceptable, or is dropping nowait better? Happy to prototype either way.

>>
>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>> index 028b9ca8ce52..1206e31f323a 100644
>> --- a/drivers/md/md-bitmap.c
>> +++ b/drivers/md/md-bitmap.c
>> @@ -2063,18 +2063,23 @@ static void bitmap_end_behind_write(struct mddev *mddev)
>>   		 bitmap->mddev->bitmap_info.max_write_behind);
>>   }
>>   
>> -static void bitmap_wait_behind_writes(struct mddev *mddev)
>> +static bool bitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
>>   {
>>   	struct bitmap *bitmap = mddev->bitmap;
>>   
>>   	/* wait for behind writes to complete */
>>   	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
>> +		if (nowait)
>> +			return false;
>> +
>>   		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
>>   			 mdname(mddev));
>>   		/* need to kick something here to make sure I/O goes? */
>>   		wait_event(bitmap->behind_wait,
>>   			   atomic_read(&bitmap->behind_writes) == 0);
>>   	}
>> +
>> +	return true;
>>   }
>>   
>>   static void bitmap_destroy(struct mddev *mddev)
>> @@ -2084,7 +2089,7 @@ static void bitmap_destroy(struct mddev *mddev)
>>   	if (!bitmap) /* there was no bitmap */
>>   		return;
>>   
>> -	bitmap_wait_behind_writes(mddev);
>> +	bitmap_wait_behind_writes(mddev, false);
>>   	if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
>>   		mddev_destroy_serial_pool(mddev, NULL);
>>   
>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>> index 214f623c7e79..f46674bdfeb9 100644
>> --- a/drivers/md/md-bitmap.h
>> +++ b/drivers/md/md-bitmap.h
>> @@ -98,7 +98,7 @@ struct bitmap_operations {
>>   
>>   	void (*start_behind_write)(struct mddev *mddev);
>>   	void (*end_behind_write)(struct mddev *mddev);
>> -	void (*wait_behind_writes)(struct mddev *mddev);
>> +	bool (*wait_behind_writes)(struct mddev *mddev, bool nowait);
>>   
>>   	md_bitmap_fn *start_write;
>>   	md_bitmap_fn *end_write;
>> diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
>> index 1adc5b117821..5a4e2abaa757 100644
>> --- a/drivers/md/md-llbitmap.c
>> +++ b/drivers/md/md-llbitmap.c
>> @@ -1574,16 +1574,19 @@ static void llbitmap_end_behind_write(struct mddev *mddev)
>>   		wake_up(&llbitmap->behind_wait);
>>   }
>>   
>> -static void llbitmap_wait_behind_writes(struct mddev *mddev)
>> +static bool llbitmap_wait_behind_writes(struct mddev *mddev, bool nowait)
>>   {
>>   	struct llbitmap *llbitmap = mddev->bitmap;
>>   
>> -	if (!llbitmap)
>> -		return;
>> +	if (llbitmap && atomic_read(&llbitmap->behind_writes) > 0) {
>> +		if (nowait)
>> +			return false;
>>   
>> -	wait_event(llbitmap->behind_wait,
>> -		   atomic_read(&llbitmap->behind_writes) == 0);
>> +		wait_event(llbitmap->behind_wait,
>> +			   atomic_read(&llbitmap->behind_writes) == 0);
>> +	}
>>   
>> +	return true;
>>   }
>>   
>>   static ssize_t bits_show(struct mddev *mddev, char *page)
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 096bb64e87bd..d1465bcd86c8 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -7050,7 +7050,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>>   static void mddev_detach(struct mddev *mddev)
>>   {
>>   	if (md_bitmap_enabled(mddev, false))
>> -		mddev->bitmap_ops->wait_behind_writes(mddev);
>> +		mddev->bitmap_ops->wait_behind_writes(mddev, false);
>>   	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
>>   		mddev->pers->quiesce(mddev, 1);
>>   		mddev->pers->quiesce(mddev, 0);
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index b1ed4cc6ade4..b2d7c13b64bd 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1341,6 +1341,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>   	int max_sectors;
>>   	int rdisk;
>>   	bool r1bio_existed = !!r1_bio;
>> +	bool nowait = bio->bi_opf & REQ_NOWAIT;
>>   
>>   	/*
>>   	 * An md cloned bio indicates we are in the error path.
>> @@ -1360,8 +1361,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>   	 * Still need barrier for READ in case that whole
>>   	 * array is frozen.
>>   	 */
>> -	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>> -				bio->bi_opf & REQ_NOWAIT)) {
>> +	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, nowait)) {
>>   		bio_wouldblock_error(bio);
>>   		return;
>>   	}
>> @@ -1402,7 +1402,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>   		 * over-take any writes that are 'behind'
>>   		 */
>>   		mddev_add_trace_msg(mddev, "raid1 wait behind writes");
>> -		mddev->bitmap_ops->wait_behind_writes(mddev);
>> +		if (!mddev->bitmap_ops->wait_behind_writes(mddev, nowait)) {
>> +			bio_wouldblock_error(bio);
>> +			set_bit(R1BIO_Returned, &r1_bio->state);
>> +			goto err_handle;
>> +		}
>>   	}
>>   
>>   	if (max_sectors < bio_sectors(bio)) {
>
> -- 
> Thanks,
> Kuai

-- 
Best Regards,
Abd-Alrhman

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-06-21 18:09 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-11  8:35 [PATCH] md/raid1: honor REQ_NOWAIT when waiting for behind writes Abd-Alrhman Masalkhi
2026-06-11  8:49 ` sashiko-bot
2026-06-11 10:49   ` Abd-Alrhman Masalkhi
2026-06-20 20:18 ` Yu Kuai
2026-06-21 18:08   ` Abd-Alrhman Masalkhi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.