* [PATCH] brd: use page reference to protect page lifetime
@ 2025-07-29 7:09 Yu Kuai
2025-07-29 7:52 ` Hou Tao
0 siblings, 1 reply; 3+ messages in thread
From: Yu Kuai @ 2025-07-29 7:09 UTC (permalink / raw)
To: hch, axboe
Cc: linux-block, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun,
johnny.chenyi
From: Yu Kuai <yukuai3@huawei.com>
As discussed [1], holding the RCU read lock while copying data from/to a
page is too heavy. It's better to protect only the page lookup with RCU and
then grab a reference to prevent the page from being freed by discard.
[1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
1 file changed, 29 insertions(+), 27 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c2eabe14af3..ce311d054cab 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -44,45 +44,55 @@ struct brd_device {
};
/*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page with reference grabbed for a given sector.
*/
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
- return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+ struct page *page;
+
+ rcu_read_lock();
+ page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+ if (page)
+ get_page(page);
+ rcu_read_unlock();
+
+ return page;
}
/*
* Insert a new page for a given sector, if one does not already exist.
+ * The returned page will grab reference.
*/
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
blk_opf_t opf)
- __releases(rcu)
- __acquires(rcu)
{
gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
struct page *page, *ret;
- rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
- if (!page) {
- rcu_read_lock();
+ if (!page)
return ERR_PTR(-ENOMEM);
- }
xa_lock(&brd->brd_pages);
ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
page, gfp);
- rcu_read_lock();
- if (ret) {
+ if (!ret) {
+ brd->brd_nr_pages++;
+ get_page(page);
xa_unlock(&brd->brd_pages);
- __free_page(page);
- if (xa_is_err(ret))
- return ERR_PTR(xa_err(ret));
+ return page;
+ }
+
+ if (!xa_is_err(ret)) {
+ get_page(ret);
+ xa_unlock(&brd->brd_pages);
+ put_page(page);
return ret;
}
- brd->brd_nr_pages++;
+
xa_unlock(&brd->brd_pages);
- return page;
+ put_page(page);
+ return ERR_PTR(xa_err(ret));
}
/*
@@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
pgoff_t idx;
xa_for_each(&brd->brd_pages, idx, page) {
- __free_page(page);
+ put_page(page);
cond_resched();
}
@@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
- rcu_read_lock();
page = brd_lookup_page(brd, sector);
if (!page && op_is_write(opf)) {
page = brd_insert_page(brd, sector, opf);
@@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
memset(kaddr, 0, bv.bv_len);
}
kunmap_local(kaddr);
- rcu_read_unlock();
bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ if (page)
+ put_page(page);
return true;
out_error:
- rcu_read_unlock();
if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
@@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
return false;
}
-static void brd_free_one_page(struct rcu_head *head)
-{
- struct page *page = container_of(head, struct page, rcu_head);
-
- __free_page(page);
-}
-
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
@@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- call_rcu(&page->rcu_head, brd_free_one_page);
+ put_page(page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
--
2.39.2
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] brd: use page reference to protect page lifetime
2025-07-29 7:09 [PATCH] brd: use page reference to protect page lifetime Yu Kuai
@ 2025-07-29 7:52 ` Hou Tao
2025-07-29 8:46 ` Yu Kuai
0 siblings, 1 reply; 3+ messages in thread
From: Hou Tao @ 2025-07-29 7:52 UTC (permalink / raw)
To: Yu Kuai, hch, axboe
Cc: linux-block, linux-kernel, yukuai3, yi.zhang, yangerkun,
johnny.chenyi
Hi,
On 7/29/2025 3:09 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> As discussed [1], hold rcu for copying data from/to page is too heavy.
> it's better to protect page with rcu around for page lookup and then
> grab a reference to prevent page to be freed by discard.
>
> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
> 1 file changed, 29 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
> index 0c2eabe14af3..ce311d054cab 100644
> --- a/drivers/block/brd.c
> +++ b/drivers/block/brd.c
> @@ -44,45 +44,55 @@ struct brd_device {
> };
>
> /*
> - * Look up and return a brd's page for a given sector.
> + * Look up and return a brd's page with reference grabbed for a given sector.
> */
> static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
> {
> - return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> + struct page *page;
> +
> + rcu_read_lock();
> + page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> + if (page)
> + get_page(page);
get_page_unless_zero() instead? Er, it seems that even
get_page_unless_zero() is not enough, because the page may be reused.
Maybe we need to re-read the brd_pages xarray to ensure the page is still
there.
> + rcu_read_unlock();
> +
> + return page;
> }
>
> /*
> * Insert a new page for a given sector, if one does not already exist.
> + * The returned page will grab reference.
> */
> static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
> blk_opf_t opf)
> - __releases(rcu)
> - __acquires(rcu)
> {
> gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
> struct page *page, *ret;
>
> - rcu_read_unlock();
> page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
> - if (!page) {
> - rcu_read_lock();
> + if (!page)
> return ERR_PTR(-ENOMEM);
> - }
>
> xa_lock(&brd->brd_pages);
> ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
> page, gfp);
> - rcu_read_lock();
> - if (ret) {
> + if (!ret) {
> + brd->brd_nr_pages++;
> + get_page(page);
> xa_unlock(&brd->brd_pages);
If I understand correctly, the initial ref-count after alloc_page() is 1
instead of 0, so I think the get_page(page) here and the get_page(ret) below
are unnecessary and will lead to a memory leak.
> - __free_page(page);
> - if (xa_is_err(ret))
> - return ERR_PTR(xa_err(ret));
> + return page;
> + }
> +
> + if (!xa_is_err(ret)) {
> + get_page(ret);
> + xa_unlock(&brd->brd_pages);
> + put_page(page);
> return ret;
> }
> - brd->brd_nr_pages++;
> +
> xa_unlock(&brd->brd_pages);
> - return page;
> + put_page(page);
> + return ERR_PTR(xa_err(ret));
> }
>
> /*
> @@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
> pgoff_t idx;
>
> xa_for_each(&brd->brd_pages, idx, page) {
> - __free_page(page);
> + put_page(page);
> cond_resched();
> }
>
> @@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>
> bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
>
> - rcu_read_lock();
> page = brd_lookup_page(brd, sector);
> if (!page && op_is_write(opf)) {
> page = brd_insert_page(brd, sector, opf);
> @@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
> memset(kaddr, 0, bv.bv_len);
> }
> kunmap_local(kaddr);
> - rcu_read_unlock();
>
> bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
> + if (page)
> + put_page(page);
> return true;
>
> out_error:
> - rcu_read_unlock();
> if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
> bio_wouldblock_error(bio);
> else
> @@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
> return false;
> }
>
> -static void brd_free_one_page(struct rcu_head *head)
> -{
> - struct page *page = container_of(head, struct page, rcu_head);
> -
> - __free_page(page);
> -}
> -
> static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
> {
> sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
> @@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
> while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
> page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
> if (page) {
> - call_rcu(&page->rcu_head, brd_free_one_page);
> + put_page(page);
> brd->brd_nr_pages--;
> }
> aligned_sector += PAGE_SECTORS;
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] brd: use page reference to protect page lifetime
2025-07-29 7:52 ` Hou Tao
@ 2025-07-29 8:46 ` Yu Kuai
0 siblings, 0 replies; 3+ messages in thread
From: Yu Kuai @ 2025-07-29 8:46 UTC (permalink / raw)
To: Hou Tao, Yu Kuai, hch, axboe
Cc: linux-block, linux-kernel, yi.zhang, yangerkun, johnny.chenyi,
yukuai (C)
Hi,
在 2025/07/29 15:52, Hou Tao 写道:
> Hi,
>
> On 7/29/2025 3:09 PM, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> As discussed [1], hold rcu for copying data from/to page is too heavy.
>> it's better to protect page with rcu around for page lookup and then
>> grab a reference to prevent page to be freed by discard.
>>
>> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>> drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
>> 1 file changed, 29 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
>> index 0c2eabe14af3..ce311d054cab 100644
>> --- a/drivers/block/brd.c
>> +++ b/drivers/block/brd.c
>> @@ -44,45 +44,55 @@ struct brd_device {
>> };
>>
>> /*
>> - * Look up and return a brd's page for a given sector.
>> + * Look up and return a brd's page with reference grabbed for a given sector.
>> */
>> static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
>> {
>> - return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
>> + struct page *page;
>> +
>> + rcu_read_lock();
>> + page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
>> + if (page)
>> + get_page(page);
>
> get_page_unless_zero() instead ? Er, it seems even with
> get_page_unless_zero(), it is not enough, becausethe page may be reused.
> Maybe we need to read the brd_pages xarray to ensure the page is still
> there.
Yes, you're right. Following the pagecache procedures, I'll switch to
using xas_load() + xas_reload() in the next version.
Thanks,
Kuai
>> + rcu_read_unlock();
>> +
>> + return page;
>> }
>>
>> /*
>> * Insert a new page for a given sector, if one does not already exist.
>> + * The returned page will grab reference.
>> */
>> static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
>> blk_opf_t opf)
>> - __releases(rcu)
>> - __acquires(rcu)
>> {
>> gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
>> struct page *page, *ret;
>>
>> - rcu_read_unlock();
>> page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
>> - if (!page) {
>> - rcu_read_lock();
>> + if (!page)
>> return ERR_PTR(-ENOMEM);
>> - }
>>
>> xa_lock(&brd->brd_pages);
>> ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
>> page, gfp);
>> - rcu_read_lock();
>> - if (ret) {
>> + if (!ret) {
>> + brd->brd_nr_pages++;
>> + get_page(page);
>> xa_unlock(&brd->brd_pages);
>
> If I understand correctly, the initial ref-count after alloc_page() is 1
> instead of 0, so I think the get_page(page) here and get_page(ret) below
> is unnecessary and it will lead to memory leak.
>> - __free_page(page);
>> - if (xa_is_err(ret))
>> - return ERR_PTR(xa_err(ret));
>> + return page;
>> + }
>> +
>> + if (!xa_is_err(ret)) {
>> + get_page(ret);
>> + xa_unlock(&brd->brd_pages);
>> + put_page(page);
>> return ret;
>> }
>> - brd->brd_nr_pages++;
>> +
>> xa_unlock(&brd->brd_pages);
>> - return page;
>> + put_page(page);
>> + return ERR_PTR(xa_err(ret));
>> }
>>
>> /*
>> @@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
>> pgoff_t idx;
>>
>> xa_for_each(&brd->brd_pages, idx, page) {
>> - __free_page(page);
>> + put_page(page);
>> cond_resched();
>> }
>>
>> @@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>>
>> bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
>>
>> - rcu_read_lock();
>> page = brd_lookup_page(brd, sector);
>> if (!page && op_is_write(opf)) {
>> page = brd_insert_page(brd, sector, opf);
>> @@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>> memset(kaddr, 0, bv.bv_len);
>> }
>> kunmap_local(kaddr);
>> - rcu_read_unlock();
>>
>> bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
>> + if (page)
>> + put_page(page);
>> return true;
>>
>> out_error:
>> - rcu_read_unlock();
>> if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
>> bio_wouldblock_error(bio);
>> else
>> @@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>> return false;
>> }
>>
>> -static void brd_free_one_page(struct rcu_head *head)
>> -{
>> - struct page *page = container_of(head, struct page, rcu_head);
>> -
>> - __free_page(page);
>> -}
>> -
>> static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>> {
>> sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
>> @@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>> while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
>> page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
>> if (page) {
>> - call_rcu(&page->rcu_head, brd_free_one_page);
>> + put_page(page);
>> brd->brd_nr_pages--;
>> }
>> aligned_sector += PAGE_SECTORS;
>
> .
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-07-29 8:46 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-29 7:09 [PATCH] brd: use page reference to protect page lifetime Yu Kuai
2025-07-29 7:52 ` Hou Tao
2025-07-29 8:46 ` Yu Kuai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).