* [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
@ 2024-01-25 12:00 Gao Xiang
2024-01-25 14:12 ` Yue Hu
2024-01-26 4:56 ` Sandeep Dhavale
0 siblings, 2 replies; 4+ messages in thread
From: Gao Xiang @ 2024-01-25 12:00 UTC (permalink / raw)
To: linux-erofs; +Cc: LKML, Gao Xiang
I encountered a race issue after lengthy (~594647 sec) stress tests on
a 64k-page arm64 VM with several 4k-block EROFS images. The timing
is like below:
 z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
   cmpxchg(&compressed_bvecs[].page,
           NULL, ..)
                                         [access bufvec]
   compressed_bvecs[] = *bvec;
Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
other fields in bufvec didn't matter. After subpage block support
landed, .offset and .end can be used too, but filling a bufvec isn't
an atomic operation, which can cause inconsistency.
Let's use a spinlock to keep the atomicity of each bufvec. More
specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
since it's rarely used (and it's held only briefly even when it is used)
as long as the pcluster has a reference.
Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
1 file changed, 38 insertions(+), 36 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 583c062cd0e4..c1c77166b30f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (i_blocksize(fe->inode) != PAGE_SIZE)
- return;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
- struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page, *oldpage;
+ struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- oldpage = READ_ONCE(zbv->page);
- if (!oldpage)
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
goto out_allocpage;
- justfound = (unsigned long)oldpage & 1UL;
- page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
-
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
- WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
- if (zbv->offset < 0)
- bvec->bv_offset = round_up(-zbv->offset, bs);
- bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
/* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;
--
2.39.3
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
2024-01-25 12:00 [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs Gao Xiang
@ 2024-01-25 14:12 ` Yue Hu
2024-01-26 4:56 ` Sandeep Dhavale
1 sibling, 0 replies; 4+ messages in thread
From: Yue Hu @ 2024-01-25 14:12 UTC (permalink / raw)
To: Gao Xiang; +Cc: linux-erofs, LKML
On Thu, 25 Jan 2024 20:00:39 +0800
Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
> is like below:
>
>  z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
>    cmpxchg(&compressed_bvecs[].page,
>            NULL, ..)
>                                          [access bufvec]
>    compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
> other fields in bufvec didn't matter. After the subpage block support
> is landed, .offset and .end can be used too, but filling bufvec isn't
> an atomic operation which can cause inconsistency.
>
> Let's use a spinlock to keep the atomicity of each bufvec. More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
> since it's rarely used (also it takes a short time if even used) as long
> as the pcluster has a reference.
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
> unsigned int i;
>
> - if (i_blocksize(fe->inode) != PAGE_SIZE)
> - return;
> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> return;
>
> for (i = 0; i < pclusterpages; ++i) {
> struct page *page, *newpage;
> void *t; /* mark pages just found for debugging */
>
> - /* the compressed page was loaded before */
> + /* Inaccurate check w/o locking to avoid unneeded lookups */
> if (READ_ONCE(pcl->compressed_bvecs[i].page))
> continue;
>
> page = find_get_page(mc, pcl->obj.index + i);
> -
> if (page) {
> t = (void *)((unsigned long)page | 1);
> newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
> t = (void *)((unsigned long)newpage | 1);
> }
> -
> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> + spin_lock(&pcl->obj.lockref.lock);
> + if (!pcl->compressed_bvecs[i].page) {
> + pcl->compressed_bvecs[i].page = t;
> + spin_unlock(&pcl->obj.lockref.lock);
> continue;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
>
> if (page)
> put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
> return 0;
> }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> - struct z_erofs_bvec *bvec)
> -{
> - struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> - while (fe->icur > 0) {
> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> - NULL, bvec->page)) {
> - pcl->compressed_bvecs[fe->icur] = *bvec;
> - return true;
> - }
> - }
> - return false;
> -}
> -
> /* callers must be with pcluster lock held */
> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> struct z_erofs_bvec *bvec, bool exclusive)
> {
> + struct z_erofs_pcluster *pcl = fe->pcl;
> int ret;
>
> if (exclusive) {
> /* give priority for inplaceio to use file pages first */
> - if (z_erofs_try_inplace_io(fe, bvec))
> + spin_lock(&pcl->obj.lockref.lock);
> + while (fe->icur > 0) {
> + if (pcl->compressed_bvecs[--fe->icur].page)
> + continue;
> + pcl->compressed_bvecs[fe->icur] = *bvec;
> + spin_unlock(&pcl->obj.lockref.lock);
> return 0;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
> +
> /* otherwise, check if it can be used as a bvpage */
> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
> !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> {
> gfp_t gfp = mapping_gfp_mask(mc);
> bool tocache = false;
> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> + struct z_erofs_bvec zbv;
> struct address_space *mapping;
> - struct page *page, *oldpage;
> + struct page *page;
> int justfound, bs = i_blocksize(f->inode);
>
> /* Except for inplace pages, the entire page can be used for I/Os */
> bvec->bv_offset = 0;
> bvec->bv_len = PAGE_SIZE;
> repeat:
> - oldpage = READ_ONCE(zbv->page);
> - if (!oldpage)
> + spin_lock(&pcl->obj.lockref.lock);
> + zbv = pcl->compressed_bvecs[nr];
> + page = zbv.page;
> + justfound = (unsigned long)page & 1UL;
> + page = (struct page *)((unsigned long)page & ~1UL);
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> + if (!page)
> goto out_allocpage;
>
> - justfound = (unsigned long)oldpage & 1UL;
> - page = (struct page *)((unsigned long)oldpage & ~1UL);
> bvec->bv_page = page;
> -
> DBG_BUGON(z_erofs_is_shortlived_page(page));
> /*
> * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> */
> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
> set_page_private(page, 0);
> - WRITE_ONCE(zbv->page, page);
> tocache = true;
> goto out_tocache;
> }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> * therefore it is impossible for `mapping` to be NULL.
> */
> if (mapping && mapping != mc) {
> - if (zbv->offset < 0)
> - bvec->bv_offset = round_up(-zbv->offset, bs);
> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> + if (zbv.offset < 0)
> + bvec->bv_offset = round_up(-zbv.offset, bs);
> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
> return;
> }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
> /* the cached page is still in managed cache */
> if (page->mapping == mc) {
> - WRITE_ONCE(zbv->page, page);
> /*
> * The cached page is still available but without a valid
> * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> put_page(page);
> out_allocpage:
> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->compressed_bvecs[nr].page) {
> erofs_pagepool_add(&f->pagepool, page);
> + spin_unlock(&pcl->obj.lockref.lock);
> cond_resched();
> goto repeat;
> }
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> bvec->bv_page = page;
> out_tocache:
> if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
> if (cur + bvec.bv_len > end)
> bvec.bv_len = end - cur;
> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
> bvec.bv_offset))
> goto submit_bio_retry;
Looks good to me.
Reviewed-by: Yue Hu <huyue2@coolpad.com>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
2024-01-25 12:00 [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs Gao Xiang
2024-01-25 14:12 ` Yue Hu
@ 2024-01-26 4:56 ` Sandeep Dhavale
2024-01-26 5:06 ` Gao Xiang
1 sibling, 1 reply; 4+ messages in thread
From: Sandeep Dhavale @ 2024-01-26 4:56 UTC (permalink / raw)
To: Gao Xiang; +Cc: linux-erofs, LKML, kernel-team
On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>
> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
> is like below:
>
>  z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
>    cmpxchg(&compressed_bvecs[].page,
>            NULL, ..)
>                                          [access bufvec]
>    compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
> other fields in bufvec didn't matter. After the subpage block support
> is landed, .offset and .end can be used too, but filling bufvec isn't
> an atomic operation which can cause inconsistency.
>
> Let's use a spinlock to keep the atomicity of each bufvec. More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
> since it's rarely used (also it takes a short time if even used) as long
> as the pcluster has a reference.
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
> unsigned int i;
>
> - if (i_blocksize(fe->inode) != PAGE_SIZE)
> - return;
> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> return;
>
> for (i = 0; i < pclusterpages; ++i) {
> struct page *page, *newpage;
> void *t; /* mark pages just found for debugging */
>
> - /* the compressed page was loaded before */
> + /* Inaccurate check w/o locking to avoid unneeded lookups */
> if (READ_ONCE(pcl->compressed_bvecs[i].page))
> continue;
>
> page = find_get_page(mc, pcl->obj.index + i);
> -
> if (page) {
> t = (void *)((unsigned long)page | 1);
> newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
> t = (void *)((unsigned long)newpage | 1);
> }
> -
> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> + spin_lock(&pcl->obj.lockref.lock);
> + if (!pcl->compressed_bvecs[i].page) {
> + pcl->compressed_bvecs[i].page = t;
> + spin_unlock(&pcl->obj.lockref.lock);
> continue;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
>
> if (page)
> put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
> return 0;
> }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> - struct z_erofs_bvec *bvec)
> -{
> - struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> - while (fe->icur > 0) {
> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> - NULL, bvec->page)) {
> - pcl->compressed_bvecs[fe->icur] = *bvec;
> - return true;
> - }
> - }
> - return false;
> -}
> -
> /* callers must be with pcluster lock held */
> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> struct z_erofs_bvec *bvec, bool exclusive)
> {
> + struct z_erofs_pcluster *pcl = fe->pcl;
> int ret;
>
> if (exclusive) {
> /* give priority for inplaceio to use file pages first */
> - if (z_erofs_try_inplace_io(fe, bvec))
> + spin_lock(&pcl->obj.lockref.lock);
> + while (fe->icur > 0) {
> + if (pcl->compressed_bvecs[--fe->icur].page)
> + continue;
> + pcl->compressed_bvecs[fe->icur] = *bvec;
> + spin_unlock(&pcl->obj.lockref.lock);
> return 0;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
> +
> /* otherwise, check if it can be used as a bvpage */
> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
> !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> {
> gfp_t gfp = mapping_gfp_mask(mc);
> bool tocache = false;
> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> + struct z_erofs_bvec zbv;
> struct address_space *mapping;
> - struct page *page, *oldpage;
> + struct page *page;
> int justfound, bs = i_blocksize(f->inode);
>
> /* Except for inplace pages, the entire page can be used for I/Os */
> bvec->bv_offset = 0;
> bvec->bv_len = PAGE_SIZE;
> repeat:
> - oldpage = READ_ONCE(zbv->page);
> - if (!oldpage)
> + spin_lock(&pcl->obj.lockref.lock);
> + zbv = pcl->compressed_bvecs[nr];
> + page = zbv.page;
> + justfound = (unsigned long)page & 1UL;
> + page = (struct page *)((unsigned long)page & ~1UL);
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> + if (!page)
> goto out_allocpage;
>
> - justfound = (unsigned long)oldpage & 1UL;
> - page = (struct page *)((unsigned long)oldpage & ~1UL);
> bvec->bv_page = page;
> -
> DBG_BUGON(z_erofs_is_shortlived_page(page));
> /*
> * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> */
> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
> set_page_private(page, 0);
> - WRITE_ONCE(zbv->page, page);
> tocache = true;
> goto out_tocache;
> }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> * therefore it is impossible for `mapping` to be NULL.
> */
> if (mapping && mapping != mc) {
> - if (zbv->offset < 0)
> - bvec->bv_offset = round_up(-zbv->offset, bs);
> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> + if (zbv.offset < 0)
> + bvec->bv_offset = round_up(-zbv.offset, bs);
> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
> return;
> }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
> /* the cached page is still in managed cache */
> if (page->mapping == mc) {
> - WRITE_ONCE(zbv->page, page);
> /*
> * The cached page is still available but without a valid
> * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> put_page(page);
> out_allocpage:
> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->compressed_bvecs[nr].page) {
> erofs_pagepool_add(&f->pagepool, page);
> + spin_unlock(&pcl->obj.lockref.lock);
> cond_resched();
> goto repeat;
> }
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> bvec->bv_page = page;
> out_tocache:
> if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
> if (cur + bvec.bv_len > end)
> bvec.bv_len = end - cur;
> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
> bvec.bv_offset))
> goto submit_bio_retry;
> --
> 2.39.3
>
LGTM!
Reviewed-by: Sandeep Dhavale <dhavale@google.com>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs
2024-01-26 4:56 ` Sandeep Dhavale
@ 2024-01-26 5:06 ` Gao Xiang
0 siblings, 0 replies; 4+ messages in thread
From: Gao Xiang @ 2024-01-26 5:06 UTC (permalink / raw)
To: Sandeep Dhavale; +Cc: kernel-team, linux-erofs, LKML
On 2024/1/26 12:56, Sandeep Dhavale via Linux-erofs wrote:
> On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>>
>> I encountered a race issue after lengthy (~594647 sec) stress tests on
>> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
>> is like below:
>>
>>  z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
>>    cmpxchg(&compressed_bvecs[].page,
>>            NULL, ..)
>>                                          [access bufvec]
>>    compressed_bvecs[] = *bvec;
>>
>> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
>> other fields in bufvec didn't matter. After the subpage block support
>> is landed, .offset and .end can be used too, but filling bufvec isn't
>> an atomic operation which can cause inconsistency.
>>
>> Let's use a spinlock to keep the atomicity of each bufvec. More
>> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
>> since it's rarely used (also it takes a short time if even used) as long
>> as the pcluster has a reference.
>>
>> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
>> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
>> ---
>> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
>> 1 file changed, 38 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 583c062cd0e4..c1c77166b30f 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
>> unsigned int i;
>>
>> - if (i_blocksize(fe->inode) != PAGE_SIZE)
>> - return;
>> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
>> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> return;
>>
>> for (i = 0; i < pclusterpages; ++i) {
>> struct page *page, *newpage;
>> void *t; /* mark pages just found for debugging */
>>
>> - /* the compressed page was loaded before */
>> + /* Inaccurate check w/o locking to avoid unneeded lookups */
>> if (READ_ONCE(pcl->compressed_bvecs[i].page))
>> continue;
>>
>> page = find_get_page(mc, pcl->obj.index + i);
>> -
>> if (page) {
>> t = (void *)((unsigned long)page | 1);
>> newpage = NULL;
>> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
>> t = (void *)((unsigned long)newpage | 1);
>> }
>> -
>> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (!pcl->compressed_bvecs[i].page) {
>> + pcl->compressed_bvecs[i].page = t;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> continue;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>>
>> if (page)
>> put_page(page);
>> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
>> return 0;
>> }
>>
>> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
>> - struct z_erofs_bvec *bvec)
>> -{
>> - struct z_erofs_pcluster *const pcl = fe->pcl;
>> -
>> - while (fe->icur > 0) {
>> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
>> - NULL, bvec->page)) {
>> - pcl->compressed_bvecs[fe->icur] = *bvec;
>> - return true;
>> - }
>> - }
>> - return false;
>> -}
>> -
>> /* callers must be with pcluster lock held */
>> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
>> struct z_erofs_bvec *bvec, bool exclusive)
>> {
>> + struct z_erofs_pcluster *pcl = fe->pcl;
>> int ret;
>>
>> if (exclusive) {
>> /* give priority for inplaceio to use file pages first */
>> - if (z_erofs_try_inplace_io(fe, bvec))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + while (fe->icur > 0) {
>> + if (pcl->compressed_bvecs[--fe->icur].page)
>> + continue;
>> + pcl->compressed_bvecs[fe->icur] = *bvec;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> return 0;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>> +
>> /* otherwise, check if it can be used as a bvpage */
>> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
>> !fe->candidate_bvpage)
>> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> {
>> gfp_t gfp = mapping_gfp_mask(mc);
>> bool tocache = false;
>> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
>> + struct z_erofs_bvec zbv;
>> struct address_space *mapping;
>> - struct page *page, *oldpage;
>> + struct page *page;
>> int justfound, bs = i_blocksize(f->inode);
>>
>> /* Except for inplace pages, the entire page can be used for I/Os */
>> bvec->bv_offset = 0;
>> bvec->bv_len = PAGE_SIZE;
>> repeat:
>> - oldpage = READ_ONCE(zbv->page);
>> - if (!oldpage)
>> + spin_lock(&pcl->obj.lockref.lock);
>> + zbv = pcl->compressed_bvecs[nr];
>> + page = zbv.page;
>> + justfound = (unsigned long)page & 1UL;
>> + page = (struct page *)((unsigned long)page & ~1UL);
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> + if (!page)
>> goto out_allocpage;
>>
>> - justfound = (unsigned long)oldpage & 1UL;
>> - page = (struct page *)((unsigned long)oldpage & ~1UL);
>> bvec->bv_page = page;
>> -
>> DBG_BUGON(z_erofs_is_shortlived_page(page));
>> /*
>> * Handle preallocated cached pages. We tried to allocate such pages
>> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> */
>> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
>> set_page_private(page, 0);
>> - WRITE_ONCE(zbv->page, page);
>> tocache = true;
>> goto out_tocache;
>> }
>> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> * therefore it is impossible for `mapping` to be NULL.
>> */
>> if (mapping && mapping != mc) {
>> - if (zbv->offset < 0)
>> - bvec->bv_offset = round_up(-zbv->offset, bs);
>> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
>> + if (zbv.offset < 0)
>> + bvec->bv_offset = round_up(-zbv.offset, bs);
>> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
>> return;
>> }
>>
>> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>>
>> /* the cached page is still in managed cache */
>> if (page->mapping == mc) {
>> - WRITE_ONCE(zbv->page, page);
>> /*
>> * The cached page is still available but without a valid
>> * `->private` pcluster hint. Let's reconnect them.
>> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> put_page(page);
>> out_allocpage:
>> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
>> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (pcl->compressed_bvecs[nr].page) {
>> erofs_pagepool_add(&f->pagepool, page);
>> + spin_unlock(&pcl->obj.lockref.lock);
>> cond_resched();
>> goto repeat;
>> }
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> bvec->bv_page = page;
>> out_tocache:
>> if (!tocache || bs != PAGE_SIZE ||
>> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>>
>> if (cur + bvec.bv_len > end)
>> bvec.bv_len = end - cur;
>> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
>> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
>> bvec.bv_offset))
>> goto submit_bio_retry;
>> --
>> 2.39.3
>>
>
> LGTM!
>
> Reviewed-by: Sandeep Dhavale <dhavale@google.com>
Thanks for the review :-)
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-01-26 5:06 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-25 12:00 [PATCH] erofs: fix infinite loop due to a race of filling compressed_bvecs Gao Xiang
2024-01-25 14:12 ` Yue Hu
2024-01-26 4:56 ` Sandeep Dhavale
2024-01-26 5:06 ` Gao Xiang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox