* [PATCH] brd: use page reference to protect page lifetime
@ 2025-07-29  7:09 Yu Kuai
  2025-07-29  7:52 ` Hou Tao
  0 siblings, 1 reply; 3+ messages in thread
From: Yu Kuai @ 2025-07-29  7:09 UTC (permalink / raw)
  To: hch, axboe
  Cc: linux-block, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun,
	johnny.chenyi

From: Yu Kuai <yukuai3@huawei.com>

As discussed [1], holding rcu while copying data from/to the page is too
heavy. It's better to protect only the page lookup with rcu, and then grab
a reference to prevent the page from being freed by discard.

[1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c2eabe14af3..ce311d054cab 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -44,45 +44,55 @@ struct brd_device {
 };
 
 /*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page for a given sector, with a reference grabbed.
  */
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
-	return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+	struct page *page;
+
+	rcu_read_lock();
+	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+	if (page)
+		get_page(page);
+	rcu_read_unlock();
+
+	return page;
 }
 
 /*
  * Insert a new page for a given sector, if one does not already exist.
+ * A reference is grabbed on the returned page.
  */
 static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
 		blk_opf_t opf)
-	__releases(rcu)
-	__acquires(rcu)
 {
 	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
 	struct page *page, *ret;
 
-	rcu_read_unlock();
 	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
-	if (!page) {
-		rcu_read_lock();
+	if (!page)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	xa_lock(&brd->brd_pages);
 	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
 			page, gfp);
-	rcu_read_lock();
-	if (ret) {
+	if (!ret) {
+		brd->brd_nr_pages++;
+		get_page(page);
 		xa_unlock(&brd->brd_pages);
-		__free_page(page);
-		if (xa_is_err(ret))
-			return ERR_PTR(xa_err(ret));
+		return page;
+	}
+
+	if (!xa_is_err(ret)) {
+		get_page(ret);
+		xa_unlock(&brd->brd_pages);
+		put_page(page);
 		return ret;
 	}
-	brd->brd_nr_pages++;
+
 	xa_unlock(&brd->brd_pages);
-	return page;
+	put_page(page);
+	return ERR_PTR(xa_err(ret));
 }
 
 /*
@@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
 	pgoff_t idx;
 
 	xa_for_each(&brd->brd_pages, idx, page) {
-		__free_page(page);
+		put_page(page);
 		cond_resched();
 	}
 
@@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 
 	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
 
-	rcu_read_lock();
 	page = brd_lookup_page(brd, sector);
 	if (!page && op_is_write(opf)) {
 		page = brd_insert_page(brd, sector, opf);
@@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 			memset(kaddr, 0, bv.bv_len);
 	}
 	kunmap_local(kaddr);
-	rcu_read_unlock();
 
 	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+	if (page)
+		put_page(page);
 	return true;
 
 out_error:
-	rcu_read_unlock();
 	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
 		bio_wouldblock_error(bio);
 	else
@@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 	return false;
 }
 
-static void brd_free_one_page(struct rcu_head *head)
-{
-	struct page *page = container_of(head, struct page, rcu_head);
-
-	__free_page(page);
-}
-
 static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
 {
 	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
@@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
 	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
 		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
 		if (page) {
-			call_rcu(&page->rcu_head, brd_free_one_page);
+			put_page(page);
 			brd->brd_nr_pages--;
 		}
 		aligned_sector += PAGE_SECTORS;
-- 
2.39.2



* Re: [PATCH] brd: use page reference to protect page lifetime
  2025-07-29  7:09 [PATCH] brd: use page reference to protect page lifetime Yu Kuai
@ 2025-07-29  7:52 ` Hou Tao
  2025-07-29  8:46   ` Yu Kuai
  0 siblings, 1 reply; 3+ messages in thread
From: Hou Tao @ 2025-07-29  7:52 UTC (permalink / raw)
  To: Yu Kuai, hch, axboe
  Cc: linux-block, linux-kernel, yukuai3, yi.zhang, yangerkun,
	johnny.chenyi

Hi,

On 7/29/2025 3:09 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> As discussed [1], holding rcu while copying data from/to the page is too
> heavy. It's better to protect only the page lookup with rcu, and then grab
> a reference to prevent the page from being freed by discard.
>
> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
>  1 file changed, 29 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
> index 0c2eabe14af3..ce311d054cab 100644
> --- a/drivers/block/brd.c
> +++ b/drivers/block/brd.c
> @@ -44,45 +44,55 @@ struct brd_device {
>  };
>  
>  /*
> - * Look up and return a brd's page for a given sector.
> + * Look up and return a brd's page for a given sector, with a reference grabbed.
>   */
>  static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
>  {
> -	return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +	struct page *page;
> +
> +	rcu_read_lock();
> +	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +	if (page)
> +		get_page(page);

get_page_unless_zero() instead? Er, it seems that even with
get_page_unless_zero() it is not enough, because the page may be reused.
Maybe we need to re-check the brd_pages xarray to ensure the page is still
there.
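
Something along these lines is what I mean (a rough, untested sketch that
mirrors the speculative lookup in the pagecache; the retry policy is only
for illustration):

	rcu_read_lock();
	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
	/* the last reference may already be gone if discard erased the page */
	if (page && !get_page_unless_zero(page))
		page = NULL;
	/*
	 * The page may have been erased and reused after the reference was
	 * grabbed, so check that it is still the entry at this index.
	 */
	if (page &&
	    page != xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT)) {
		put_page(page);
		page = NULL;	/* or retry the lookup */
	}
	rcu_read_unlock();
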
> +	rcu_read_unlock();
> +
> +	return page;
>  }
>  
>  /*
>   * Insert a new page for a given sector, if one does not already exist.
> + * A reference is grabbed on the returned page.
>   */
>  static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
>  		blk_opf_t opf)
> -	__releases(rcu)
> -	__acquires(rcu)
>  {
>  	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
>  	struct page *page, *ret;
>  
> -	rcu_read_unlock();
>  	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
> -	if (!page) {
> -		rcu_read_lock();
> +	if (!page)
>  		return ERR_PTR(-ENOMEM);
> -	}
>  
>  	xa_lock(&brd->brd_pages);
>  	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
>  			page, gfp);
> -	rcu_read_lock();
> -	if (ret) {
> +	if (!ret) {
> +		brd->brd_nr_pages++;
> +		get_page(page);
>  		xa_unlock(&brd->brd_pages);

If I understand correctly, the initial ref-count after alloc_page() is 1
instead of 0, so I think the get_page(page) here and the get_page(ret) below
are unnecessary and will lead to a memory leak.
> -		__free_page(page);
> -		if (xa_is_err(ret))
> -			return ERR_PTR(xa_err(ret));
> +		return page;
> +	}
> +
> +	if (!xa_is_err(ret)) {
> +		get_page(ret);
> +		xa_unlock(&brd->brd_pages);
> +		put_page(page);
>  		return ret;
>  	}
> -	brd->brd_nr_pages++;
> +
>  	xa_unlock(&brd->brd_pages);
> -	return page;
> +	put_page(page);
> +	return ERR_PTR(xa_err(ret));
>  }
>  
>  /*
> @@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
>  	pgoff_t idx;
>  
>  	xa_for_each(&brd->brd_pages, idx, page) {
> -		__free_page(page);
> +		put_page(page);
>  		cond_resched();
>  	}
>  
> @@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>  
>  	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
>  
> -	rcu_read_lock();
>  	page = brd_lookup_page(brd, sector);
>  	if (!page && op_is_write(opf)) {
>  		page = brd_insert_page(brd, sector, opf);
> @@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>  			memset(kaddr, 0, bv.bv_len);
>  	}
>  	kunmap_local(kaddr);
> -	rcu_read_unlock();
>  
>  	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
> +	if (page)
> +		put_page(page);
>  	return true;
>  
>  out_error:
> -	rcu_read_unlock();
>  	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
>  		bio_wouldblock_error(bio);
>  	else
> @@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>  	return false;
>  }
>  
> -static void brd_free_one_page(struct rcu_head *head)
> -{
> -	struct page *page = container_of(head, struct page, rcu_head);
> -
> -	__free_page(page);
> -}
> -
>  static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>  {
>  	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
> @@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>  	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
>  		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
>  		if (page) {
> -			call_rcu(&page->rcu_head, brd_free_one_page);
> +			put_page(page);
>  			brd->brd_nr_pages--;
>  		}
>  		aligned_sector += PAGE_SECTORS;



* Re: [PATCH] brd: use page reference to protect page lifetime
  2025-07-29  7:52 ` Hou Tao
@ 2025-07-29  8:46   ` Yu Kuai
  0 siblings, 0 replies; 3+ messages in thread
From: Yu Kuai @ 2025-07-29  8:46 UTC (permalink / raw)
  To: Hou Tao, Yu Kuai, hch, axboe
  Cc: linux-block, linux-kernel, yi.zhang, yangerkun, johnny.chenyi,
	yukuai (C)

Hi,

On 2025/07/29 15:52, Hou Tao wrote:
> Hi,
> 
> On 7/29/2025 3:09 PM, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> As discussed [1], holding rcu while copying data from/to the page is too
>> heavy. It's better to protect only the page lookup with rcu, and then grab
>> a reference to prevent the page from being freed by discard.
>>
>> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
>>   1 file changed, 29 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
>> index 0c2eabe14af3..ce311d054cab 100644
>> --- a/drivers/block/brd.c
>> +++ b/drivers/block/brd.c
>> @@ -44,45 +44,55 @@ struct brd_device {
>>   };
>>   
>>   /*
>> - * Look up and return a brd's page for a given sector.
>> + * Look up and return a brd's page for a given sector, with a reference grabbed.
>>    */
>>   static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
>>   {
>> -	return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
>> +	struct page *page;
>> +
>> +	rcu_read_lock();
>> +	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
>> +	if (page)
>> +		get_page(page);
> 
> get_page_unless_zero() instead? Er, it seems that even with
> get_page_unless_zero() it is not enough, because the page may be reused.
> Maybe we need to re-check the brd_pages xarray to ensure the page is still
> there.

Yes, you're right. Following the pagecache procedures, I'll switch to
using xas_load + xas_reload in the next version.
Thanks,
Kuai

>> +	rcu_read_unlock();
>> +
>> +	return page;
>>   }
>>   
>>   /*
>>    * Insert a new page for a given sector, if one does not already exist.
>> + * A reference is grabbed on the returned page.
>>    */
>>   static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
>>   		blk_opf_t opf)
>> -	__releases(rcu)
>> -	__acquires(rcu)
>>   {
>>   	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
>>   	struct page *page, *ret;
>>   
>> -	rcu_read_unlock();
>>   	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
>> -	if (!page) {
>> -		rcu_read_lock();
>> +	if (!page)
>>   		return ERR_PTR(-ENOMEM);
>> -	}
>>   
>>   	xa_lock(&brd->brd_pages);
>>   	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
>>   			page, gfp);
>> -	rcu_read_lock();
>> -	if (ret) {
>> +	if (!ret) {
>> +		brd->brd_nr_pages++;
>> +		get_page(page);
>>   		xa_unlock(&brd->brd_pages);
> 
> If I understand correctly, the initial ref-count after alloc_page() is 1
> instead of 0, so I think the get_page(page) here and the get_page(ret) below
> are unnecessary and will lead to a memory leak.
>> -		__free_page(page);
>> -		if (xa_is_err(ret))
>> -			return ERR_PTR(xa_err(ret));
>> +		return page;
>> +	}
>> +
>> +	if (!xa_is_err(ret)) {
>> +		get_page(ret);
>> +		xa_unlock(&brd->brd_pages);
>> +		put_page(page);
>>   		return ret;
>>   	}
>> -	brd->brd_nr_pages++;
>> +
>>   	xa_unlock(&brd->brd_pages);
>> -	return page;
>> +	put_page(page);
>> +	return ERR_PTR(xa_err(ret));
>>   }
>>   
>>   /*
>> @@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
>>   	pgoff_t idx;
>>   
>>   	xa_for_each(&brd->brd_pages, idx, page) {
>> -		__free_page(page);
>> +		put_page(page);
>>   		cond_resched();
>>   	}
>>   
>> @@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>>   
>>   	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
>>   
>> -	rcu_read_lock();
>>   	page = brd_lookup_page(brd, sector);
>>   	if (!page && op_is_write(opf)) {
>>   		page = brd_insert_page(brd, sector, opf);
>> @@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>>   			memset(kaddr, 0, bv.bv_len);
>>   	}
>>   	kunmap_local(kaddr);
>> -	rcu_read_unlock();
>>   
>>   	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
>> +	if (page)
>> +		put_page(page);
>>   	return true;
>>   
>>   out_error:
>> -	rcu_read_unlock();
>>   	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
>>   		bio_wouldblock_error(bio);
>>   	else
>> @@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>>   	return false;
>>   }
>>   
>> -static void brd_free_one_page(struct rcu_head *head)
>> -{
>> -	struct page *page = container_of(head, struct page, rcu_head);
>> -
>> -	__free_page(page);
>> -}
>> -
>>   static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>>   {
>>   	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
>> @@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>>   	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
>>   		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
>>   		if (page) {
>> -			call_rcu(&page->rcu_head, brd_free_one_page);
>> +			put_page(page);
>>   			brd->brd_nr_pages--;
>>   		}
>>   		aligned_sector += PAGE_SECTORS;
> 
> .
> 


