Re: parts of pages on NFS being replaced by swaths of NULs

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mike Snitzer <snitzer@kernel.org>
To: Trond Myklebust <trondmy@kernel.org>
Cc: Jeff Layton <jlayton@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	linux-nfs@vger.kernel.org, jonathan.flynn@hammerspace.com
Subject: Re: parts of pages on NFS being replaced by swaths of NULs
Date: Tue, 19 Aug 2025 14:38:42 -0400	[thread overview]
Message-ID: <aKTEsmOAXsHEGEEz@kernel.org> (raw)
In-Reply-To: <aKD3SFcyCnb8snst@kernel.org>

On Sat, Aug 16, 2025 at 05:25:28PM -0400, Mike Snitzer wrote:
> On Sat, Aug 16, 2025 at 07:51:17AM -0700, Trond Myklebust wrote:
> > 
> > That's not sufficient AFAICS. Does the following patch work?
> > 
> > 8<------------------------------------------------------------
> > From fc9690dda01f001c6cd11665701394da8ebba1ab Mon Sep 17 00:00:00 2001
> > Message-ID: <fc9690dda01f001c6cd11665701394da8ebba1ab.1755355810.git.trond.myklebust@hammerspace.com>
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Date: Sat, 16 Aug 2025 07:25:20 -0700
> > Subject: [PATCH] NFS: Fix a race when updating an existing write
> > 
> > After nfs_lock_and_join_requests() tests for whether the request is
> > still attached to the mapping, nothing prevents a call to
> > nfs_inode_remove_request() from succeeding until we actually lock the
> > page group.
> > The reason is that whoever called nfs_inode_remove_request() doesn't
> > necessarily have a lock on the page group head.
> > 
> > So in order to avoid races, let's take the page group lock earlier in
> > nfs_lock_and_join_requests(), and hold it across the removal of the
> > request in nfs_inode_remove_request().
> > 
> > Reported-by: Jeff Layton <jlayton@kernel.org>
> > Fixes: c3f2235782c3 ("nfs: fold nfs_folio_find_and_lock_request into nfs_lock_and_join_requests")
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> >  fs/nfs/pagelist.c        |  9 +++++----
> >  fs/nfs/write.c           | 29 ++++++++++-------------------
> >  include/linux/nfs_page.h |  1 +
> >  3 files changed, 16 insertions(+), 23 deletions(-)
> > 
> > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> > index 11968dcb7243..6e69ce43a13f 100644
> > --- a/fs/nfs/pagelist.c
> > +++ b/fs/nfs/pagelist.c
> > @@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req)
> >  	nfs_page_clear_headlock(req);
> >  }
> >  
> > -/*
> > - * nfs_page_group_sync_on_bit_locked
> > +/**
> > + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set
> > + * @req: request in page group
> > + * @bit: PG_* bit that is used to sync page group
> >   *
> >   * must be called with page group lock held
> >   */
> > -static bool
> > -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> > +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> >  {
> >  	struct nfs_page *head = req->wb_head;
> >  	struct nfs_page *tmp;
> > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > index fa5c41d0989a..8b7c04737967 100644
> > --- a/fs/nfs/write.c
> > +++ b/fs/nfs/write.c
> > @@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
> >  	}
> >  }
> >  
> > -static int
> > -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
> > +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
> >  {
> > -	int ret;
> > -
> > -	if (!test_bit(PG_REMOVE, &req->wb_flags))
> > -		return 0;
> > -	ret = nfs_page_group_lock(req);
> > -	if (ret)
> > -		return ret;
> >  	if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
> >  		nfs_page_set_inode_ref(req, inode);
> > -	nfs_page_group_unlock(req);
> > -	return 0;
> >  }
> >  
> >  /**
> > @@ -585,19 +575,18 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
> >  		}
> >  	}
> >  
> > +	ret = nfs_page_group_lock(head);
> > +	if (ret < 0)
> > +		goto out_unlock;
> > +
> >  	/* Ensure that nobody removed the request before we locked it */
> >  	if (head != folio->private) {
> > +		nfs_page_group_unlock(head);
> >  		nfs_unlock_and_release_request(head);
> >  		goto retry;
> >  	}
> >  
> > -	ret = nfs_cancel_remove_inode(head, inode);
> > -	if (ret < 0)
> > -		goto out_unlock;
> > -
> > -	ret = nfs_page_group_lock(head);
> > -	if (ret < 0)
> > -		goto out_unlock;
> > +	nfs_cancel_remove_inode(head, inode);
> >  
> >  	/* lock each request in the page group */
> >  	for (subreq = head->wb_this_page;
> > @@ -786,7 +775,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> >  {
> >  	struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
> >  
> > -	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
> > +	nfs_page_group_lock(req);
> > +	if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) {
> >  		struct folio *folio = nfs_page_to_folio(req->wb_head);
> >  		struct address_space *mapping = folio->mapping;
> >  
> > @@ -798,6 +788,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> >  		}
> >  		spin_unlock(&mapping->i_private_lock);
> >  	}
> > +	nfs_page_group_unlock(req);
> >  
> >  	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
> >  		atomic_long_dec(&nfsi->nrequests);
> > diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> > index 169b4ae30ff4..9aed39abc94b 100644
> > --- a/include/linux/nfs_page.h
> > +++ b/include/linux/nfs_page.h
> > @@ -160,6 +160,7 @@ extern void nfs_join_page_group(struct nfs_page *head,
> >  extern int nfs_page_group_lock(struct nfs_page *);
> >  extern void nfs_page_group_unlock(struct nfs_page *);
> >  extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
> > +extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int);
> >  extern	int nfs_page_set_headlock(struct nfs_page *req);
> >  extern void nfs_page_clear_headlock(struct nfs_page *req);
> >  extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
> > -- 
> > 2.50.1
> > 
> > 
> 
> 
> Trond,
> 
> You're the best! ;)
> 
> Your patch fixes corruption I've been chasing for the past week
> relative to NFS DIRECT, specifically with:
> echo Y > /sys/module/nfs/parameters/localio_O_DIRECT_align_misaligned_IO
> 
> So you need my latest NFS DIRECT patchset:
> https://lore.kernel.org/linux-nfs/20250815233003.55071-1-snitzer@kernel.org/
> 
> With it, writes would be corrupted when using the attached reproducer
> (from Jon Flynn, with the assistance of ChatGPT) that pulls out the
> subset of the MLperf unet3d test (when run in buffered IO mode, so
> entirely misaligned relative to DIO-alignment requirements) with which
> we've seen npz CRC compare failures.
> 
> I tested my patchset with your patch applied and it all "just works".
> 
> Ship it all!
> 
> Thanks,
> Mike
> 
> ps. running the attached reproducer is as simple as:
> ./mlperf_npz_tool.py --npz-path /mnt/share1/sample_a.npz

So even with this patch there is something still causing data
corruption on much faster systems.  This mlperf_npz_tool.py reproducer
works fine on a slower VMware-based testbed, but once I load the same
kernel (with Trond's fix applied) on a very fast bare-metal system I
still see a CRC mismatch due to write corruption if
localio_O_DIRECT_align_misaligned_IO=Y (from my NFS DIRECT patchset).

Again, both of my latest NFSD DIRECT and NFS DIRECT patchsets are
applied:
https://lore.kernel.org/linux-nfs/20250815144607.50967-1-snitzer@kernel.org/
https://lore.kernel.org/linux-nfs/20250815233003.55071-1-snitzer@kernel.org/

And I'm hacking nfs4 to impose O_DIRECT on buffered IO (the reproducer
doesn't open its files with O_DIRECT, but the NFS client acts as if it
did).  I can dig out the patch to achieve that if others are interested.

Mike

next prev parent reply	other threads:[~2025-08-19 18:38 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-30 14:52 parts of pages on NFS being replaced by swaths of NULs Jeff Layton
2025-07-31 21:56 ` Trond Myklebust
2025-07-31 23:04   ` Jeff Layton
2025-08-06 14:20   ` Jeff Layton
2025-08-12 11:58   ` Jeff Layton
2025-08-12 16:58     ` Trond Myklebust
2025-08-12 17:20       ` Jeff Layton
2025-08-16 13:01       ` Jeff Layton
2025-08-16 14:51         ` Trond Myklebust
2025-08-16 15:31           ` Jeff Layton
2025-08-16 17:16           ` Jeff Layton
2025-08-16 21:25           ` Mike Snitzer
2025-08-19 18:38             ` Mike Snitzer [this message]
2025-08-19 20:13               ` Jeff Layton
2025-08-19 21:38                 ` seeing CRC mismatch with NFS DIRECT WRITE when localio_O_DIRECT_align_misaligned_IO=Y Mike Snitzer
2025-08-19 21:50                   ` Jeff Layton
2025-08-19 17:10           ` parts of pages on NFS being replaced by swaths of NULs Jeff Layton
2025-08-19 18:28             ` Trond Myklebust
2025-08-19 19:48               ` Jeff Layton
2025-08-19 20:47                 ` Mike Snitzer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aKTEsmOAXsHEGEEz@kernel.org \
    --to=snitzer@kernel.org \
    --cc=anna@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=jonathan.flynn@hammerspace.com \
    --cc=linux-nfs@vger.kernel.org \
    --cc=trondmy@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.