Linux RAID subsystem development
 help / color / mirror / Atom feed
* [PATCH v2] md/raid5: read batch_head under stripe_lock in make_stripe_request
@ 2026-06-19  8:11 Chen Cheng
  2026-06-20 22:01 ` yu kuai
  0 siblings, 1 reply; 3+ messages in thread
From: Chen Cheng @ 2026-06-19  8:11 UTC (permalink / raw)
  To: linux-raid, yukuai, yukuai; +Cc: chencheng, linux-kernel

From: Chen Cheng <chencheng@fnnas.com>

KCSAN reports race in raid5_make_request() vs. stripe_add_to_batch_list()

Writer flow (stripe_add_to_batch_list):
1. grab `head` stripe;
2. lock_two_stripes(head, sh);
3. re-check stripe_can_batch() for both head and sh, which requires
   STRIPE_BATCH_READY set on both;
4. write head->batch_head = head and sh->batch_head = head;
5. unlock_two_stripes.

STRIPE_BATCH_READY is cleared in two places:
- clear_batch_ready(), at the entry of handle_stripe();
- __add_stripe_bio(), for non-batchable bios.
And, both need to acquire `stripe_lock`.

Under stripe_lock, if STRIPE_BATCH_READY is clear, then:
- New writers cannot install a batch_head;
- Existing writers have already finished.
So .. handle_stripe() readers can ready `batch_head` locklessly.

Fix way:
Writer side make_stripe_request() under STRIPE_BATCH_READY, so , need
to be protected by stripe_lock when read something..

v1 -> v2:
- re-expalin how stripe_lock and batch_head work in commit message , and ,
- modify comment in raid5.h.

Fixs: f4aec6a097387


KCSAN report:
======================================
  BUG: KCSAN: data-race in raid5_make_request / raid5_make_request

  write to 0xffff8f03062432d8 of 8 bytes by task 210246 on cpu 6:
    raid5_make_request+0x175e/0x2ab0
    md_handle_request+0x2c5/0x700
    md_submit_bio+0x126/0x320
    [.........]
    btrfs_sync_file+0x181/0x970
    vfs_fsync_range+0x71/0x110
    do_fsync+0x46/0xa0
    __x64_sys_fsync+0x20/0x30

  read to 0xffff8f03062432d8 of 8 bytes by task 210251 on cpu 0:
    raid5_make_request+0x7c7/0x2ab0
    md_handle_request+0x2c5/0x700
    md_submit_bio+0x126/0x320
    [.........]
    btrfs_remap_file_range+0x266/0x980
    vfs_clone_file_range+0x16d/0x610
    ioctl_file_clone+0x64/0xd0
    do_vfs_ioctl+0x87f/0xbc0
    __x64_sys_ioctl+0xb8/0x130

  value changed: 0x0000000000000000 -> 0xffff8f0307798728


Signed-off-by: Chen Cheng <chencheng@fnnas.com>
---
 drivers/md/raid5.c | 2 ++
 drivers/md/raid5.h | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5521051a9425..efc63740f867 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6108,14 +6108,16 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 		ctx->do_flush = false;
 	}
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
+	spin_lock_irq(&sh->stripe_lock);
 	if ((!sh->batch_head || sh == sh->batch_head) &&
 	    (bi->bi_opf & REQ_SYNC) &&
 	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 		atomic_inc(&conf->preread_active_stripes);
+	spin_unlock_irq(&sh->stripe_lock);
 
 	release_stripe_plug(mddev, sh);
 	return STRIPE_SUCCESS;
 
 out_release:
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1c7b710fc9c1..9ff825697ba3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,11 +221,17 @@ struct stripe_head {
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
 	int			cpu;
 	struct r5worker_group	*group;
 
-	struct stripe_head	*batch_head; /* protected by stripe lock */
+	/*
+	 * Writer protected by stripe_lock.
+	 * Reader hold stripe_lock when STRIPE_BATCH_READY is set.
+	 * Without STRIPE_BATCH_READY means no concurrent write,
+	 * lockless read is ok.
+	 */
+	struct stripe_head	*batch_head;
 	spinlock_t		batch_lock; /* only header's lock is useful */
 	struct list_head	batch_list; /* protected by head's batch lock*/
 
 	union {
 		struct r5l_io_unit	*log_io;
-- 
2.54.0

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] md/raid5: read batch_head under stripe_lock in make_stripe_request
  2026-06-19  8:11 [PATCH v2] md/raid5: read batch_head under stripe_lock in make_stripe_request Chen Cheng
@ 2026-06-20 22:01 ` yu kuai
  2026-06-22  3:08   ` Chen Cheng
  0 siblings, 1 reply; 3+ messages in thread
From: yu kuai @ 2026-06-20 22:01 UTC (permalink / raw)
  To: Chen Cheng, linux-raid, yukuai; +Cc: linux-kernel

Hi,

在 2026/6/19 16:11, Chen Cheng 写道:
> From: Chen Cheng <chencheng@fnnas.com>
>
> KCSAN reports race in raid5_make_request() vs. stripe_add_to_batch_list()
>
> Writer flow (stripe_add_to_batch_list):
> 1. grab `head` stripe;
> 2. lock_two_stripes(head, sh);
> 3. re-check stripe_can_batch() for both head and sh, which requires
>     STRIPE_BATCH_READY set on both;
> 4. write head->batch_head = head and sh->batch_head = head;
> 5. unlock_two_stripes.
>
> STRIPE_BATCH_READY is cleared in two places:
> - clear_batch_ready(), at the entry of handle_stripe();
> - __add_stripe_bio(), for non-batchable bios.
> And, both need to acquire `stripe_lock`.
>
> Under stripe_lock, if STRIPE_BATCH_READY is clear, then:
> - New writers cannot install a batch_head;
> - Existing writers have already finished.
> So .. handle_stripe() readers can ready `batch_head` locklessly.

This does not explain the race clearly, I still have no clue yet.

>
> Fix way:
> Writer side make_stripe_request() under STRIPE_BATCH_READY, so , need
> to be protected by stripe_lock when read something..
>
> v1 -> v2:
> - re-expalin how stripe_lock and batch_head work in commit message , and ,
> - modify comment in raid5.h.
>
> Fixs: f4aec6a097387

Weird fix tag again.

>
>
> KCSAN report:
> ======================================
>    BUG: KCSAN: data-race in raid5_make_request / raid5_make_request
>
>    write to 0xffff8f03062432d8 of 8 bytes by task 210246 on cpu 6:
>      raid5_make_request+0x175e/0x2ab0
>      md_handle_request+0x2c5/0x700
>      md_submit_bio+0x126/0x320
>      [.........]
>      btrfs_sync_file+0x181/0x970
>      vfs_fsync_range+0x71/0x110
>      do_fsync+0x46/0xa0
>      __x64_sys_fsync+0x20/0x30
>
>    read to 0xffff8f03062432d8 of 8 bytes by task 210251 on cpu 0:
>      raid5_make_request+0x7c7/0x2ab0
>      md_handle_request+0x2c5/0x700
>      md_submit_bio+0x126/0x320
>      [.........]
>      btrfs_remap_file_range+0x266/0x980
>      vfs_clone_file_range+0x16d/0x610
>      ioctl_file_clone+0x64/0xd0
>      do_vfs_ioctl+0x87f/0xbc0
>      __x64_sys_ioctl+0xb8/0x130
>
>    value changed: 0x0000000000000000 -> 0xffff8f0307798728

Is this a mismatch report?

>
>
> Signed-off-by: Chen Cheng <chencheng@fnnas.com>
> ---
>   drivers/md/raid5.c | 2 ++
>   drivers/md/raid5.h | 8 +++++++-
>   2 files changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 5521051a9425..efc63740f867 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6108,14 +6108,16 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
>   		ctx->do_flush = false;
>   	}
>   
>   	set_bit(STRIPE_HANDLE, &sh->state);
>   	clear_bit(STRIPE_DELAYED, &sh->state);
> +	spin_lock_irq(&sh->stripe_lock);
>   	if ((!sh->batch_head || sh == sh->batch_head) &&
>   	    (bi->bi_opf & REQ_SYNC) &&
>   	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>   		atomic_inc(&conf->preread_active_stripes);
> +	spin_unlock_irq(&sh->stripe_lock);
>   
>   	release_stripe_plug(mddev, sh);
>   	return STRIPE_SUCCESS;
>   
>   out_release:
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 1c7b710fc9c1..9ff825697ba3 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -221,11 +221,17 @@ struct stripe_head {
>   	enum reconstruct_states reconstruct_state;
>   	spinlock_t		stripe_lock;
>   	int			cpu;
>   	struct r5worker_group	*group;
>   
> -	struct stripe_head	*batch_head; /* protected by stripe lock */
> +	/*
> +	 * Writer protected by stripe_lock.
> +	 * Reader hold stripe_lock when STRIPE_BATCH_READY is set.
> +	 * Without STRIPE_BATCH_READY means no concurrent write,
> +	 * lockless read is ok.
> +	 */
> +	struct stripe_head	*batch_head;
>   	spinlock_t		batch_lock; /* only header's lock is useful */
>   	struct list_head	batch_list; /* protected by head's batch lock*/
>   
>   	union {
>   		struct r5l_io_unit	*log_io;

-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] md/raid5: read batch_head under stripe_lock in make_stripe_request
  2026-06-20 22:01 ` yu kuai
@ 2026-06-22  3:08   ` Chen Cheng
  0 siblings, 0 replies; 3+ messages in thread
From: Chen Cheng @ 2026-06-22  3:08 UTC (permalink / raw)
  To: yukuai, linux-raid; +Cc: linux-kernel

在 2026/6/21 06:01, yu kuai 写道:
> Hi,
> 
> 在 2026/6/19 16:11, Chen Cheng 写道:
>> From: Chen Cheng <chencheng@fnnas.com>
>>
>> KCSAN reports race in raid5_make_request() vs. stripe_add_to_batch_list()
>>
>> Writer flow (stripe_add_to_batch_list):
>> 1. grab `head` stripe;
>> 2. lock_two_stripes(head, sh);
>> 3. re-check stripe_can_batch() for both head and sh, which requires
>>      STRIPE_BATCH_READY set on both;
>> 4. write head->batch_head = head and sh->batch_head = head;
>> 5. unlock_two_stripes.
>>
>> STRIPE_BATCH_READY is cleared in two places:
>> - clear_batch_ready(), at the entry of handle_stripe();
>> - __add_stripe_bio(), for non-batchable bios.
>> And, both need to acquire `stripe_lock`.
>>
>> Under stripe_lock, if STRIPE_BATCH_READY is clear, then:
>> - New writers cannot install a batch_head;
>> - Existing writers have already finished.
>> So .. handle_stripe() readers can ready `batch_head` locklessly.
> 
> This does not explain the race clearly, I still have no clue yet.


 From the semantic correctness perspective, I think the lock is needed.

 From the race consequence perspective, the worst consequence I can see 
is that it could add to a batch member stripe. But 
`conf->preread_active_stripes` should only add to batch head or lone stripe.



the scenario:
=========================
sh1 and sh2 are neighbor, wich means,
if sh1 start with sector X, then, sh2 start with sectorX + STRIPE_SECTORS,

CPU0                                    CPU1
make_stripe_request(sh2)
-> add_all_stripe_bios(sh2)             make_stripe_request(sh2)
                                         -> add_all_stripe_bios(sh2)
                                         -> stripe_add_to_batch_list(sh2)
                                            -> lock_two_stripes(sh1, sh2)
                                            -> sh1->batch_head = sh1
                                            -> sh2->batch_head = sh1
                                            -> test_and_clear_bit(
                                                 STRIPE_PREREAD_ACTIVE,
                                                 &sh2->state)
                                            -> unlock_two_stripes(sh1, sh2)
-> if ((!sh2->batch_head ||
         sh2 == sh2->batch_head) &&
        REQ_SYNC &&
        !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh2->state))
        atomic_inc(&conf->preread_active_stripes)




After CPU2 batches 'sh', CPU1 can still treat it as a lone stripe and
charge preread_active_stripes. Since CPU2 has already run the follower
side compensation, the later increment has no matching decrement.



> 
>>
>> Fix way:
>> Writer side make_stripe_request() under STRIPE_BATCH_READY, so , need
>> to be protected by stripe_lock when read something..
>>
>> v1 -> v2:
>> - re-expalin how stripe_lock and batch_head work in commit message , and ,
>> - modify comment in raid5.h.
>>
>> Fixs: f4aec6a097387
> 
> Weird fix tag again.
> 
>>
>>
>> KCSAN report:
>> ======================================
>>     BUG: KCSAN: data-race in raid5_make_request / raid5_make_request
>>
>>     write to 0xffff8f03062432d8 of 8 bytes by task 210246 on cpu 6:
>>       raid5_make_request+0x175e/0x2ab0
>>       md_handle_request+0x2c5/0x700
>>       md_submit_bio+0x126/0x320
>>       [.........]
>>       btrfs_sync_file+0x181/0x970
>>       vfs_fsync_range+0x71/0x110
>>       do_fsync+0x46/0xa0
>>       __x64_sys_fsync+0x20/0x30
>>
>>     read to 0xffff8f03062432d8 of 8 bytes by task 210251 on cpu 0:
>>       raid5_make_request+0x7c7/0x2ab0
>>       md_handle_request+0x2c5/0x700
>>       md_submit_bio+0x126/0x320
>>       [.........]
>>       btrfs_remap_file_range+0x266/0x980
>>       vfs_clone_file_range+0x16d/0x610
>>       ioctl_file_clone+0x64/0xd0
>>       do_vfs_ioctl+0x87f/0xbc0
>>       __x64_sys_ioctl+0xb8/0x130
>>
>>     value changed: 0x0000000000000000 -> 0xffff8f0307798728
> 
> Is this a mismatch report?
> 
>>
>>
>> Signed-off-by: Chen Cheng <chencheng@fnnas.com>
>> ---
>>    drivers/md/raid5.c | 2 ++
>>    drivers/md/raid5.h | 8 +++++++-
>>    2 files changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 5521051a9425..efc63740f867 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -6108,14 +6108,16 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
>>    		ctx->do_flush = false;
>>    	}
>>    
>>    	set_bit(STRIPE_HANDLE, &sh->state);
>>    	clear_bit(STRIPE_DELAYED, &sh->state);
>> +	spin_lock_irq(&sh->stripe_lock);
>>    	if ((!sh->batch_head || sh == sh->batch_head) &&
>>    	    (bi->bi_opf & REQ_SYNC) &&
>>    	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>>    		atomic_inc(&conf->preread_active_stripes);
>> +	spin_unlock_irq(&sh->stripe_lock);
>>    
>>    	release_stripe_plug(mddev, sh);
>>    	return STRIPE_SUCCESS;
>>    
>>    out_release:
>> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
>> index 1c7b710fc9c1..9ff825697ba3 100644
>> --- a/drivers/md/raid5.h
>> +++ b/drivers/md/raid5.h
>> @@ -221,11 +221,17 @@ struct stripe_head {
>>    	enum reconstruct_states reconstruct_state;
>>    	spinlock_t		stripe_lock;
>>    	int			cpu;
>>    	struct r5worker_group	*group;
>>    
>> -	struct stripe_head	*batch_head; /* protected by stripe lock */
>> +	/*
>> +	 * Writer protected by stripe_lock.
>> +	 * Reader hold stripe_lock when STRIPE_BATCH_READY is set.
>> +	 * Without STRIPE_BATCH_READY means no concurrent write,
>> +	 * lockless read is ok.
>> +	 */
>> +	struct stripe_head	*batch_head;
>>    	spinlock_t		batch_lock; /* only header's lock is useful */
>>    	struct list_head	batch_list; /* protected by head's batch lock*/
>>    
>>    	union {
>>    		struct r5l_io_unit	*log_io;
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-22  3:08 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-19  8:11 [PATCH v2] md/raid5: read batch_head under stripe_lock in make_stripe_request Chen Cheng
2026-06-20 22:01 ` yu kuai
2026-06-22  3:08   ` Chen Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox