* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-13 19:01 [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
@ 2025-10-13 19:34 ` Jeff Layton
2025-10-14 5:24 ` NeilBrown
` (3 subsequent siblings)
4 siblings, 0 replies; 23+ messages in thread
From: Jeff Layton @ 2025-10-13 19:34 UTC (permalink / raw)
To: Chuck Lever, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Mike Snitzer, Chuck Lever
On Mon, 2025-10-13 at 15:01 -0400, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> not need a subsequent COMMIT operation, saving a round trip and
> allowing the client to dispense with cached dirty data as soon as
> it receives the server's WRITE response.
>
> This patch refactors nfsd_vfs_write() to return a possibly modified
> stable_how value to its callers. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> fs/nfsd/nfs3proc.c | 2 +-
> fs/nfsd/nfs4proc.c | 2 +-
> fs/nfsd/nfsproc.c | 3 ++-
> fs/nfsd/vfs.c | 11 ++++++-----
> fs/nfsd/vfs.h | 6 ++++--
> fs/nfsd/xdr3.h | 2 +-
> 6 files changed, 15 insertions(+), 11 deletions(-)
>
> Here's what I had in mind, based on a patch I did a year ago but
> never posted.
>
> If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
> it can now set *stable_how and the WRITE encoders will return the
> updated value to the client.
>
>
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index b6d03e1ef5f7..ad14b34583bb 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
> resp->committed = argp->stable;
> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> &argp->payload, &cnt,
> - resp->committed, resp->verf);
> + &resp->committed, resp->verf);
> resp->count = cnt;
> resp->status = nfsd3_map_status(resp->status);
> return rpc_success;
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7f7e6bb23a90..2222bb283baf 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> write->wr_how_written = write->wr_stable_how;
> status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
> write->wr_offset, &write->wr_payload,
> - &cnt, write->wr_how_written,
> + &cnt, &write->wr_how_written,
> (__be32 *)write->wr_verifier.data);
> nfsd_file_put(nf);
>
> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> index 8f71f5748c75..706401ed6f8d 100644
> --- a/fs/nfsd/nfsproc.c
> +++ b/fs/nfsd/nfsproc.c
> @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> struct nfsd_writeargs *argp = rqstp->rq_argp;
> struct nfsd_attrstat *resp = rqstp->rq_resp;
> unsigned long cnt = argp->len;
> + u32 committed = NFS_DATA_SYNC;
>
> dprintk("nfsd: WRITE %s %u bytes at %d\n",
> SVCFH_fmt(&argp->fh),
> @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>
> fh_copy(&resp->fh, &argp->fh);
> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
> + &argp->payload, &cnt, &committed, NULL);
> if (resp->status == nfs_ok)
> resp->status = fh_getattr(&resp->fh, &resp->stat);
> else if (resp->status == nfserr_jukebox)
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index f537a7b4ee01..8b2dc7a88aab 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
> * @offset: Byte offset of start
> * @payload: xdr_buf containing the write payload
> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> - * @stable: An NFS stable_how value
> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> * @verf: NFS WRITE verifier
> *
> * Upon return, caller must invoke fh_put on @fhp.
> @@ -1274,11 +1274,12 @@ __be32
> nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> struct nfsd_file *nf, loff_t offset,
> const struct xdr_buf *payload, unsigned long *cnt,
> - int stable, __be32 *verf)
> + u32 *stable_how, __be32 *verf)
> {
> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> struct file *file = nf->nf_file;
> struct super_block *sb = file_inode(file)->i_sb;
> + u32 stable = *stable_how;
> struct kiocb kiocb;
> struct svc_export *exp;
> struct iov_iter iter;
> @@ -1434,7 +1435,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * @offset: Byte offset of start
> * @payload: xdr_buf containing the write payload
> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> - * @stable: An NFS stable_how value
> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> * @verf: NFS WRITE verifier
> *
> * Upon return, caller must invoke fh_put on @fhp.
> @@ -1444,7 +1445,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> */
> __be32
> nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> - const struct xdr_buf *payload, unsigned long *cnt, int stable,
> + const struct xdr_buf *payload, unsigned long *cnt, u32 *stable_how,
> __be32 *verf)
> {
> struct nfsd_file *nf;
> @@ -1457,7 +1458,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> goto out;
>
> err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
> - stable, verf);
> + stable_how, verf);
> nfsd_file_put(nf);
> out:
> trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index fa46f8b5f132..c713ed0b04e0 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -130,11 +130,13 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> u32 *eof);
> __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> loff_t offset, const struct xdr_buf *payload,
> - unsigned long *cnt, int stable, __be32 *verf);
> + unsigned long *cnt, u32 *stable_how,
> + __be32 *verf);
> __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> struct nfsd_file *nf, loff_t offset,
> const struct xdr_buf *payload,
> - unsigned long *cnt, int stable, __be32 *verf);
> + unsigned long *cnt, u32 *stable_how,
> + __be32 *verf);
> __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
> char *, int *);
> __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
> index 522067b7fd75..c0e443ef3a6b 100644
> --- a/fs/nfsd/xdr3.h
> +++ b/fs/nfsd/xdr3.h
> @@ -152,7 +152,7 @@ struct nfsd3_writeres {
> __be32 status;
> struct svc_fh fh;
> unsigned long count;
> - int committed;
> + u32 committed;
> __be32 verf[2];
> };
>
Looks reasonable, and with this we can have the DIO write path override
what the client requested.
Reviewed-by: Jeff Layton <jlayton@kernel.org>
^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-13 19:01 [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
2025-10-13 19:34 ` Jeff Layton
@ 2025-10-14 5:24 ` NeilBrown
2025-10-15 16:53 ` Mike Snitzer
` (2 subsequent siblings)
4 siblings, 0 replies; 23+ messages in thread
From: NeilBrown @ 2025-10-14 5:24 UTC (permalink / raw)
To: Chuck Lever
Cc: Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Mike Snitzer, Chuck Lever
On Tue, 14 Oct 2025, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> not need a subsequent COMMIT operation, saving a round trip and
> allowing the client to dispense with cached dirty data as soon as
> it receives the server's WRITE response.
>
> This patch refactors nfsd_vfs_write() to return a possibly modified
> stable_how value to its callers. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: NeilBrown <neil@brown.name>
Looks good and useful.
NeilBrown
^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-13 19:01 [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
2025-10-13 19:34 ` Jeff Layton
2025-10-14 5:24 ` NeilBrown
@ 2025-10-15 16:53 ` Mike Snitzer
2025-10-15 18:00 ` Chuck Lever
2025-10-15 22:13 ` [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
2025-10-17 4:08 ` [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Christoph Hellwig
4 siblings, 1 reply; 23+ messages in thread
From: Mike Snitzer @ 2025-10-15 16:53 UTC (permalink / raw)
To: Chuck Lever
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> not need a subsequent COMMIT operation, saving a round trip and
> allowing the client to dispense with cached dirty data as soon as
> it receives the server's WRITE response.
>
> This patch refactors nfsd_vfs_write() to return a possibly modified
> stable_how value to its callers. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> fs/nfsd/nfs3proc.c | 2 +-
> fs/nfsd/nfs4proc.c | 2 +-
> fs/nfsd/nfsproc.c | 3 ++-
> fs/nfsd/vfs.c | 11 ++++++-----
> fs/nfsd/vfs.h | 6 ++++--
> fs/nfsd/xdr3.h | 2 +-
> 6 files changed, 15 insertions(+), 11 deletions(-)
>
> Here's what I had in mind, based on a patch I did a year ago but
> never posted.
>
> If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
> it can now set *stable_how and the WRITE encoders will return the
> updated value to the client.
>
>
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index b6d03e1ef5f7..ad14b34583bb 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
> resp->committed = argp->stable;
> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> &argp->payload, &cnt,
> - resp->committed, resp->verf);
> + &resp->committed, resp->verf);
> resp->count = cnt;
> resp->status = nfsd3_map_status(resp->status);
> return rpc_success;
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7f7e6bb23a90..2222bb283baf 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> write->wr_how_written = write->wr_stable_how;
> status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
> write->wr_offset, &write->wr_payload,
> - &cnt, write->wr_how_written,
> + &cnt, &write->wr_how_written,
> (__be32 *)write->wr_verifier.data);
> nfsd_file_put(nf);
>
> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> index 8f71f5748c75..706401ed6f8d 100644
> --- a/fs/nfsd/nfsproc.c
> +++ b/fs/nfsd/nfsproc.c
> @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> struct nfsd_writeargs *argp = rqstp->rq_argp;
> struct nfsd_attrstat *resp = rqstp->rq_resp;
> unsigned long cnt = argp->len;
> + u32 committed = NFS_DATA_SYNC;
>
> dprintk("nfsd: WRITE %s %u bytes at %d\n",
> SVCFH_fmt(&argp->fh),
> @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>
> fh_copy(&resp->fh, &argp->fh);
> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
> + &argp->payload, &cnt, &committed, NULL);
> if (resp->status == nfs_ok)
> resp->status = fh_getattr(&resp->fh, &resp->stat);
> else if (resp->status == nfserr_jukebox)
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index f537a7b4ee01..8b2dc7a88aab 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
> * @offset: Byte offset of start
> * @payload: xdr_buf containing the write payload
> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> - * @stable: An NFS stable_how value
> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> * @verf: NFS WRITE verifier
> *
> * Upon return, caller must invoke fh_put on @fhp.
> @@ -1274,11 +1274,12 @@ __be32
> nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> struct nfsd_file *nf, loff_t offset,
> const struct xdr_buf *payload, unsigned long *cnt,
> - int stable, __be32 *verf)
> + u32 *stable_how, __be32 *verf)
> {
> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> struct file *file = nf->nf_file;
> struct super_block *sb = file_inode(file)->i_sb;
> + u32 stable = *stable_how;
> struct kiocb kiocb;
> struct svc_export *exp;
> struct iov_iter iter;
Seems we need this instead here?
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5e5e5187d2e5..d0c89f8fdb57 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1479,7 +1479,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file *file = nf->nf_file;
struct super_block *sb = file_inode(file)->i_sb;
- u32 stable = *stable_how;
+ u32 stable;
struct kiocb kiocb;
struct svc_export *exp;
errseq_t since;
@@ -1511,7 +1511,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
exp = fhp->fh_export;
if (!EX_ISSYNC(exp))
- stable = NFS_UNSTABLE;
+ *stable_how = NFS_UNSTABLE;
+ stable = *stable_how;
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = offset;
if (stable && !fhp->fh_use_wgather)
Otherwise, isn't there potential for nfsd_vfs_write()'s NFS_UNSTABLE
override to silently ignore a client-requested stable_how that was set
to something other than NFS_UNSTABLE, while the client assumes no
COMMIT is needed? And does this then elevate this patch to be a
stable@ fix? (No pun intended.)
If not, please help me understand why.
Thanks!
> @@ -1434,7 +1435,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> * @offset: Byte offset of start
> * @payload: xdr_buf containing the write payload
> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> - * @stable: An NFS stable_how value
> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> * @verf: NFS WRITE verifier
> *
> * Upon return, caller must invoke fh_put on @fhp.
> @@ -1444,7 +1445,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> */
> __be32
> nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> - const struct xdr_buf *payload, unsigned long *cnt, int stable,
> + const struct xdr_buf *payload, unsigned long *cnt, u32 *stable_how,
> __be32 *verf)
> {
> struct nfsd_file *nf;
> @@ -1457,7 +1458,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> goto out;
>
> err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
> - stable, verf);
> + stable_how, verf);
> nfsd_file_put(nf);
> out:
> trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index fa46f8b5f132..c713ed0b04e0 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -130,11 +130,13 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> u32 *eof);
> __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> loff_t offset, const struct xdr_buf *payload,
> - unsigned long *cnt, int stable, __be32 *verf);
> + unsigned long *cnt, u32 *stable_how,
> + __be32 *verf);
> __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> struct nfsd_file *nf, loff_t offset,
> const struct xdr_buf *payload,
> - unsigned long *cnt, int stable, __be32 *verf);
> + unsigned long *cnt, u32 *stable_how,
> + __be32 *verf);
> __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
> char *, int *);
> __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
> index 522067b7fd75..c0e443ef3a6b 100644
> --- a/fs/nfsd/xdr3.h
> +++ b/fs/nfsd/xdr3.h
> @@ -152,7 +152,7 @@ struct nfsd3_writeres {
> __be32 status;
> struct svc_fh fh;
> unsigned long count;
> - int committed;
> + u32 committed;
> __be32 verf[2];
> };
>
> --
> 2.51.0
>
^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-15 16:53 ` Mike Snitzer
@ 2025-10-15 18:00 ` Chuck Lever
2025-10-15 18:11 ` Jeff Layton
2025-10-15 18:40 ` Mike Snitzer
0 siblings, 2 replies; 23+ messages in thread
From: Chuck Lever @ 2025-10-15 18:00 UTC (permalink / raw)
To: Mike Snitzer
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On 10/15/25 12:53 PM, Mike Snitzer wrote:
> On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>>
>> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
>> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
>> not need a subsequent COMMIT operation, saving a round trip and
>> allowing the client to dispense with cached dirty data as soon as
>> it receives the server's WRITE response.
>>
>> This patch refactors nfsd_vfs_write() to return a possibly modified
>> stable_how value to its callers. No behavior change is expected.
>>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfsd/nfs3proc.c | 2 +-
>> fs/nfsd/nfs4proc.c | 2 +-
>> fs/nfsd/nfsproc.c | 3 ++-
>> fs/nfsd/vfs.c | 11 ++++++-----
>> fs/nfsd/vfs.h | 6 ++++--
>> fs/nfsd/xdr3.h | 2 +-
>> 6 files changed, 15 insertions(+), 11 deletions(-)
>>
>> Here's what I had in mind, based on a patch I did a year ago but
>> never posted.
>>
>> If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
>> it can now set *stable_how and the WRITE encoders will return the
>> updated value to the client.
>>
>>
>> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
>> index b6d03e1ef5f7..ad14b34583bb 100644
>> --- a/fs/nfsd/nfs3proc.c
>> +++ b/fs/nfsd/nfs3proc.c
>> @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
>> resp->committed = argp->stable;
>> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
>> &argp->payload, &cnt,
>> - resp->committed, resp->verf);
>> + &resp->committed, resp->verf);
>> resp->count = cnt;
>> resp->status = nfsd3_map_status(resp->status);
>> return rpc_success;
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 7f7e6bb23a90..2222bb283baf 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>> write->wr_how_written = write->wr_stable_how;
>> status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
>> write->wr_offset, &write->wr_payload,
>> - &cnt, write->wr_how_written,
>> + &cnt, &write->wr_how_written,
>> (__be32 *)write->wr_verifier.data);
>> nfsd_file_put(nf);
>>
>> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
>> index 8f71f5748c75..706401ed6f8d 100644
>> --- a/fs/nfsd/nfsproc.c
>> +++ b/fs/nfsd/nfsproc.c
>> @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>> struct nfsd_writeargs *argp = rqstp->rq_argp;
>> struct nfsd_attrstat *resp = rqstp->rq_resp;
>> unsigned long cnt = argp->len;
>> + u32 committed = NFS_DATA_SYNC;
>>
>> dprintk("nfsd: WRITE %s %u bytes at %d\n",
>> SVCFH_fmt(&argp->fh),
>> @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>>
>> fh_copy(&resp->fh, &argp->fh);
>> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
>> - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
>> + &argp->payload, &cnt, &committed, NULL);
>> if (resp->status == nfs_ok)
>> resp->status = fh_getattr(&resp->fh, &resp->stat);
>> else if (resp->status == nfserr_jukebox)
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index f537a7b4ee01..8b2dc7a88aab 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
>> * @offset: Byte offset of start
>> * @payload: xdr_buf containing the write payload
>> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
>> - * @stable: An NFS stable_how value
>> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
>> * @verf: NFS WRITE verifier
>> *
>> * Upon return, caller must invoke fh_put on @fhp.
>> @@ -1274,11 +1274,12 @@ __be32
>> nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> struct nfsd_file *nf, loff_t offset,
>> const struct xdr_buf *payload, unsigned long *cnt,
>> - int stable, __be32 *verf)
>> + u32 *stable_how, __be32 *verf)
>> {
>> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
>> struct file *file = nf->nf_file;
>> struct super_block *sb = file_inode(file)->i_sb;
>> + u32 stable = *stable_how;
>> struct kiocb kiocb;
>> struct svc_export *exp;
>> struct iov_iter iter;
>
> Seems we need this instead here?
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 5e5e5187d2e5..d0c89f8fdb57 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1479,7 +1479,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> struct file *file = nf->nf_file;
> struct super_block *sb = file_inode(file)->i_sb;
> - u32 stable = *stable_how;
> + u32 stable;
> struct kiocb kiocb;
> struct svc_export *exp;
> errseq_t since;
> @@ -1511,7 +1511,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> exp = fhp->fh_export;
>
> if (!EX_ISSYNC(exp))
> - stable = NFS_UNSTABLE;
> + *stable_how = NFS_UNSTABLE;
> + stable = *stable_how;
> init_sync_kiocb(&kiocb, file);
> kiocb.ki_pos = offset;
> if (stable && !fhp->fh_use_wgather)
>
> Otherwise, isn't there potential for nfsd_vfs_write()'s NFS_UNSTABLE
> override to silently ignore a client-requested stable_how that was set
> to something other than NFS_UNSTABLE, while the client assumes no
> COMMIT is needed? And does this then elevate this patch to be a
> stable@ fix? (No pun intended.)
>
> If not, please help me understand why.
The protocol permits an NFS server to change the stable_how field in a
WRITE response as follows:
UNSTABLE -> DATA_SYNC
UNSTABLE -> FILE_SYNC
DATA_SYNC -> FILE_SYNC
It forbids the reverse direction. A client that asks for a FILE_SYNC
WRITE MUST get a FILE_SYNC reply.
What the EX_ISSYNC test is doing is looking for the "async" export
option. When that option is specified, internally NFSD converts all
WRITEs, including FILE_SYNC WRITEs, to UNSTABLE. It then complies with
the protocol by not telling the client about this invalid change and
defies the protocol by not ensuring FILE_SYNC WRITEs are persisted
before replying. See exports(5).
In these cases, each WRITE response still reflects what the client
requested, but it does not reflect what the server actually did.
We tell anyone who will listen (and even those who won't) never to use
the "async" export option because of the silent data corruption risk it
introduces. But it goes faster than using the fully protocol-compliant
"sync" export option, so people use it anyway.
> Thanks!
>
>> @@ -1434,7 +1435,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> * @offset: Byte offset of start
>> * @payload: xdr_buf containing the write payload
>> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
>> - * @stable: An NFS stable_how value
>> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
>> * @verf: NFS WRITE verifier
>> *
>> * Upon return, caller must invoke fh_put on @fhp.
>> @@ -1444,7 +1445,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> */
>> __be32
>> nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>> - const struct xdr_buf *payload, unsigned long *cnt, int stable,
>> + const struct xdr_buf *payload, unsigned long *cnt, u32 *stable_how,
>> __be32 *verf)
>> {
>> struct nfsd_file *nf;
>> @@ -1457,7 +1458,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>> goto out;
>>
>> err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
>> - stable, verf);
>> + stable_how, verf);
>> nfsd_file_put(nf);
>> out:
>> trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
>> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
>> index fa46f8b5f132..c713ed0b04e0 100644
>> --- a/fs/nfsd/vfs.h
>> +++ b/fs/nfsd/vfs.h
>> @@ -130,11 +130,13 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> u32 *eof);
>> __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> loff_t offset, const struct xdr_buf *payload,
>> - unsigned long *cnt, int stable, __be32 *verf);
>> + unsigned long *cnt, u32 *stable_how,
>> + __be32 *verf);
>> __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> struct nfsd_file *nf, loff_t offset,
>> const struct xdr_buf *payload,
>> - unsigned long *cnt, int stable, __be32 *verf);
>> + unsigned long *cnt, u32 *stable_how,
>> + __be32 *verf);
>> __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
>> char *, int *);
>> __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
>> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
>> index 522067b7fd75..c0e443ef3a6b 100644
>> --- a/fs/nfsd/xdr3.h
>> +++ b/fs/nfsd/xdr3.h
>> @@ -152,7 +152,7 @@ struct nfsd3_writeres {
>> __be32 status;
>> struct svc_fh fh;
>> unsigned long count;
>> - int committed;
>> + u32 committed;
>> __be32 verf[2];
>> };
>>
>> --
>> 2.51.0
>>
--
Chuck Lever
^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-15 18:00 ` Chuck Lever
@ 2025-10-15 18:11 ` Jeff Layton
2025-10-15 18:16 ` Chuck Lever
2025-10-17 6:22 ` NeilBrown
2025-10-15 18:40 ` Mike Snitzer
1 sibling, 2 replies; 23+ messages in thread
From: Jeff Layton @ 2025-10-15 18:11 UTC (permalink / raw)
To: Chuck Lever, Mike Snitzer
Cc: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On Wed, 2025-10-15 at 14:00 -0400, Chuck Lever wrote:
> On 10/15/25 12:53 PM, Mike Snitzer wrote:
> > On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
> > > From: Chuck Lever <chuck.lever@oracle.com>
> > >
> > > In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> > > WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> > > not need a subsequent COMMIT operation, saving a round trip and
> > > allowing the client to dispense with cached dirty data as soon as
> > > it receives the server's WRITE response.
> > >
> > > This patch refactors nfsd_vfs_write() to return a possibly modified
> > > stable_how value to its callers. No behavior change is expected.
> > >
> > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > ---
> > > fs/nfsd/nfs3proc.c | 2 +-
> > > fs/nfsd/nfs4proc.c | 2 +-
> > > fs/nfsd/nfsproc.c | 3 ++-
> > > fs/nfsd/vfs.c | 11 ++++++-----
> > > fs/nfsd/vfs.h | 6 ++++--
> > > fs/nfsd/xdr3.h | 2 +-
> > > 6 files changed, 15 insertions(+), 11 deletions(-)
> > >
> > > Here's what I had in mind, based on a patch I did a year ago but
> > > never posted.
> > >
> > > If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
> > > it can now set *stable_how and the WRITE encoders will return the
> > > updated value to the client.
> > >
> > >
> > > diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> > > index b6d03e1ef5f7..ad14b34583bb 100644
> > > --- a/fs/nfsd/nfs3proc.c
> > > +++ b/fs/nfsd/nfs3proc.c
> > > @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
> > > resp->committed = argp->stable;
> > > resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> > > &argp->payload, &cnt,
> > > - resp->committed, resp->verf);
> > > + &resp->committed, resp->verf);
> > > resp->count = cnt;
> > > resp->status = nfsd3_map_status(resp->status);
> > > return rpc_success;
> > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > > index 7f7e6bb23a90..2222bb283baf 100644
> > > --- a/fs/nfsd/nfs4proc.c
> > > +++ b/fs/nfsd/nfs4proc.c
> > > @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> > > write->wr_how_written = write->wr_stable_how;
> > > status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
> > > write->wr_offset, &write->wr_payload,
> > > - &cnt, write->wr_how_written,
> > > + &cnt, &write->wr_how_written,
> > > (__be32 *)write->wr_verifier.data);
> > > nfsd_file_put(nf);
> > >
> > > diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> > > index 8f71f5748c75..706401ed6f8d 100644
> > > --- a/fs/nfsd/nfsproc.c
> > > +++ b/fs/nfsd/nfsproc.c
> > > @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> > > struct nfsd_writeargs *argp = rqstp->rq_argp;
> > > struct nfsd_attrstat *resp = rqstp->rq_resp;
> > > unsigned long cnt = argp->len;
> > > + u32 committed = NFS_DATA_SYNC;
> > >
> > > dprintk("nfsd: WRITE %s %u bytes at %d\n",
> > > SVCFH_fmt(&argp->fh),
> > > @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> > >
> > > fh_copy(&resp->fh, &argp->fh);
> > > resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> > > - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
> > > + &argp->payload, &cnt, &committed, NULL);
> > > if (resp->status == nfs_ok)
> > > resp->status = fh_getattr(&resp->fh, &resp->stat);
> > > else if (resp->status == nfserr_jukebox)
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index f537a7b4ee01..8b2dc7a88aab 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
> > > * @offset: Byte offset of start
> > > * @payload: xdr_buf containing the write payload
> > > * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> > > - * @stable: An NFS stable_how value
> > > + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> > > * @verf: NFS WRITE verifier
> > > *
> > > * Upon return, caller must invoke fh_put on @fhp.
> > > @@ -1274,11 +1274,12 @@ __be32
> > > nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > struct nfsd_file *nf, loff_t offset,
> > > const struct xdr_buf *payload, unsigned long *cnt,
> > > - int stable, __be32 *verf)
> > > + u32 *stable_how, __be32 *verf)
> > > {
> > > struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> > > struct file *file = nf->nf_file;
> > > struct super_block *sb = file_inode(file)->i_sb;
> > > + u32 stable = *stable_how;
> > > struct kiocb kiocb;
> > > struct svc_export *exp;
> > > struct iov_iter iter;
> >
> > Seems we need this instead here?
> >
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 5e5e5187d2e5..d0c89f8fdb57 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1479,7 +1479,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> > struct file *file = nf->nf_file;
> > struct super_block *sb = file_inode(file)->i_sb;
> > - u32 stable = *stable_how;
> > + u32 stable;
> > struct kiocb kiocb;
> > struct svc_export *exp;
> > errseq_t since;
> > @@ -1511,7 +1511,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > exp = fhp->fh_export;
> >
> > if (!EX_ISSYNC(exp))
> > - stable = NFS_UNSTABLE;
> > + *stable_how = NFS_UNSTABLE;
> > + stable = *stable_how;
> > init_sync_kiocb(&kiocb, file);
> > kiocb.ki_pos = offset;
> > if (stable && !fhp->fh_use_wgather)
> >
> > Otherwise, isn't there potential for nfsd_vfs_write()'s NFS_UNSTABLE
> > override to silently ignore a client-requested stable_how that was set
> > to something other than NFS_UNSTABLE, while the client assumes no
> > COMMIT is needed? And does this then elevate this patch to be a
> > stable@ fix? (No pun intended.)
> >
> > If not, please help me understand why.
>
> The protocol permits an NFS server to change the stable_how field in a
> WRITE response as follows:
>
> UNSTABLE -> DATA_SYNC
> UNSTABLE -> FILE_SYNC
> DATA_SYNC -> FILE_SYNC
>
> It forbids the reverse direction. A client that asks for a FILE_SYNC
> WRITE MUST get a FILE_SYNC reply.
>
> What the EX_ISSYNC test is doing is looking for the "async" export
> option. When that option is specified, internally NFSD converts all
> WRITEs, including FILE_SYNC WRITEs, to UNSTABLE. It then complies with
> the protocol by not telling the client about this invalid change and
> defies the protocol by not ensuring FILE_SYNC WRITEs are persisted
> before replying. See exports(5).
>
> In these cases, each WRITE response still reflects what the client
> requested, but it does not reflect what the server actually did.
>
> We tell anyone who will listen (and even those who won't) never to use
> the "async" export option because of the silent data corruption risk it
> introduces. But it goes faster than using the fully protocol-compliant
> "sync" export option, so people use it anyway.
>
>
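The promotion rule quoted above reduces to a single ordering comparison, since the three stable_how values are ordered by strength. As an illustrative userspace sketch (the helper name is invented; only the NFS_* numeric values match the protocol constants):

```c
#include <assert.h>
#include <stdbool.h>

/* stable_how values from RFC 1813 / RFC 8881 */
enum nfs_stable_how {
	NFS_UNSTABLE  = 0,
	NFS_DATA_SYNC = 1,
	NFS_FILE_SYNC = 2,
};

/*
 * A server may only "promote" stability: the committed value in the
 * reply must be at least as strong as what the client requested.
 * Because the values are ordered, the rule is one comparison.
 */
static bool stable_how_reply_is_valid(enum nfs_stable_how requested,
				      enum nfs_stable_how replied)
{
	return replied >= requested;
}
```

In particular, a FILE_SYNC request can only ever be answered with FILE_SYNC; that is the invariant the "async" export option quietly breaks on the persistence side while keeping the on-the-wire reply legal-looking.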
Somewhat related question:

Since we're on track to deprecate NFSv2 support soon, should we be
looking at deprecating the "async" export option too? v2 was the main
reason for it in the first place, after all.
> > Thanks!
> >
> > > @@ -1434,7 +1435,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > * @offset: Byte offset of start
> > > * @payload: xdr_buf containing the write payload
> > > * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> > > - * @stable: An NFS stable_how value
> > > + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> > > * @verf: NFS WRITE verifier
> > > *
> > > * Upon return, caller must invoke fh_put on @fhp.
> > > @@ -1444,7 +1445,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > */
> > > __be32
> > > nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> > > - const struct xdr_buf *payload, unsigned long *cnt, int stable,
> > > + const struct xdr_buf *payload, unsigned long *cnt, u32 *stable_how,
> > > __be32 *verf)
> > > {
> > > struct nfsd_file *nf;
> > > @@ -1457,7 +1458,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
> > > goto out;
> > >
> > > err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
> > > - stable, verf);
> > > + stable_how, verf);
> > > nfsd_file_put(nf);
> > > out:
> > > trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
> > > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> > > index fa46f8b5f132..c713ed0b04e0 100644
> > > --- a/fs/nfsd/vfs.h
> > > +++ b/fs/nfsd/vfs.h
> > > @@ -130,11 +130,13 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > u32 *eof);
> > > __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > loff_t offset, const struct xdr_buf *payload,
> > > - unsigned long *cnt, int stable, __be32 *verf);
> > > + unsigned long *cnt, u32 *stable_how,
> > > + __be32 *verf);
> > > __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > struct nfsd_file *nf, loff_t offset,
> > > const struct xdr_buf *payload,
> > > - unsigned long *cnt, int stable, __be32 *verf);
> > > + unsigned long *cnt, u32 *stable_how,
> > > + __be32 *verf);
> > > __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
> > > char *, int *);
> > > __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
> > > diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
> > > index 522067b7fd75..c0e443ef3a6b 100644
> > > --- a/fs/nfsd/xdr3.h
> > > +++ b/fs/nfsd/xdr3.h
> > > @@ -152,7 +152,7 @@ struct nfsd3_writeres {
> > > __be32 status;
> > > struct svc_fh fh;
> > > unsigned long count;
> > > - int committed;
> > > + u32 committed;
> > > __be32 verf[2];
> > > };
> > >
> > > --
> > > 2.51.0
> > >
>
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply	[flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-15 18:11 ` Jeff Layton
@ 2025-10-15 18:16 ` Chuck Lever
2025-10-17 6:22 ` NeilBrown
1 sibling, 0 replies; 23+ messages in thread
From: Chuck Lever @ 2025-10-15 18:16 UTC (permalink / raw)
To: Jeff Layton, Mike Snitzer
Cc: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On 10/15/25 2:11 PM, Jeff Layton wrote:
> On Wed, 2025-10-15 at 14:00 -0400, Chuck Lever wrote:
>> On 10/15/25 12:53 PM, Mike Snitzer wrote:
>>> On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>
>>>> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
>>>> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
>>>> not need a subsequent COMMIT operation, saving a round trip and
>>>> allowing the client to dispense with cached dirty data as soon as
>>>> it receives the server's WRITE response.
>>>>
>>>> This patch refactors nfsd_vfs_write() to return a possibly modified
>>>> stable_how value to its callers. No behavior change is expected.
>>>>
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>> fs/nfsd/nfs3proc.c | 2 +-
>>>> fs/nfsd/nfs4proc.c | 2 +-
>>>> fs/nfsd/nfsproc.c | 3 ++-
>>>> fs/nfsd/vfs.c | 11 ++++++-----
>>>> fs/nfsd/vfs.h | 6 ++++--
>>>> fs/nfsd/xdr3.h | 2 +-
>>>> 6 files changed, 15 insertions(+), 11 deletions(-)
>>>>
>>>> Here's what I had in mind, based on a patch I did a year ago but
>>>> never posted.
>>>>
>>>> If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
>>>> it can now set *stable_how and the WRITE encoders will return the
>>>> updated value to the client.
>>>>
>>>>
>>>> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
>>>> index b6d03e1ef5f7..ad14b34583bb 100644
>>>> --- a/fs/nfsd/nfs3proc.c
>>>> +++ b/fs/nfsd/nfs3proc.c
>>>> @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
>>>> resp->committed = argp->stable;
>>>> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
>>>> &argp->payload, &cnt,
>>>> - resp->committed, resp->verf);
>>>> + &resp->committed, resp->verf);
>>>> resp->count = cnt;
>>>> resp->status = nfsd3_map_status(resp->status);
>>>> return rpc_success;
>>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>>> index 7f7e6bb23a90..2222bb283baf 100644
>>>> --- a/fs/nfsd/nfs4proc.c
>>>> +++ b/fs/nfsd/nfs4proc.c
>>>> @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>> write->wr_how_written = write->wr_stable_how;
>>>> status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
>>>> write->wr_offset, &write->wr_payload,
>>>> - &cnt, write->wr_how_written,
>>>> + &cnt, &write->wr_how_written,
>>>> (__be32 *)write->wr_verifier.data);
>>>> nfsd_file_put(nf);
>>>>
>>>> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
>>>> index 8f71f5748c75..706401ed6f8d 100644
>>>> --- a/fs/nfsd/nfsproc.c
>>>> +++ b/fs/nfsd/nfsproc.c
>>>> @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>>>> struct nfsd_writeargs *argp = rqstp->rq_argp;
>>>> struct nfsd_attrstat *resp = rqstp->rq_resp;
>>>> unsigned long cnt = argp->len;
>>>> + u32 committed = NFS_DATA_SYNC;
>>>>
>>>> dprintk("nfsd: WRITE %s %u bytes at %d\n",
>>>> SVCFH_fmt(&argp->fh),
>>>> @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>>>>
>>>> fh_copy(&resp->fh, &argp->fh);
>>>> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
>>>> - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
>>>> + &argp->payload, &cnt, &committed, NULL);
>>>> if (resp->status == nfs_ok)
>>>> resp->status = fh_getattr(&resp->fh, &resp->stat);
>>>> else if (resp->status == nfserr_jukebox)
>>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>>> index f537a7b4ee01..8b2dc7a88aab 100644
>>>> --- a/fs/nfsd/vfs.c
>>>> +++ b/fs/nfsd/vfs.c
>>>> @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
>>>> * @offset: Byte offset of start
>>>> * @payload: xdr_buf containing the write payload
>>>> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
>>>> - * @stable: An NFS stable_how value
>>>> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
>>>> * @verf: NFS WRITE verifier
>>>> *
>>>> * Upon return, caller must invoke fh_put on @fhp.
>>>> @@ -1274,11 +1274,12 @@ __be32
>>>> nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> struct nfsd_file *nf, loff_t offset,
>>>> const struct xdr_buf *payload, unsigned long *cnt,
>>>> - int stable, __be32 *verf)
>>>> + u32 *stable_how, __be32 *verf)
>>>> {
>>>> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
>>>> struct file *file = nf->nf_file;
>>>> struct super_block *sb = file_inode(file)->i_sb;
>>>> + u32 stable = *stable_how;
>>>> struct kiocb kiocb;
>>>> struct svc_export *exp;
>>>> struct iov_iter iter;
>>>
>>> Seems we need this instead here?
>>>
>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>> index 5e5e5187d2e5..d0c89f8fdb57 100644
>>> --- a/fs/nfsd/vfs.c
>>> +++ b/fs/nfsd/vfs.c
>>> @@ -1479,7 +1479,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
>>> struct file *file = nf->nf_file;
>>> struct super_block *sb = file_inode(file)->i_sb;
>>> - u32 stable = *stable_how;
>>> + u32 stable;
>>> struct kiocb kiocb;
>>> struct svc_export *exp;
>>> errseq_t since;
>>> @@ -1511,7 +1511,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>> exp = fhp->fh_export;
>>>
>>> if (!EX_ISSYNC(exp))
>>> - stable = NFS_UNSTABLE;
>>> + *stable_how = NFS_UNSTABLE;
>>> + stable = *stable_how;
>>> init_sync_kiocb(&kiocb, file);
>>> kiocb.ki_pos = offset;
>>> if (stable && !fhp->fh_use_wgather)
>>>
>>> Otherwise, isn't there potential for nfsd_vfs_write's NFS_UNSTABLE
>>> override to cause a client-set stable_how, one set to something
>>> other than NFS_UNSTABLE, to silently _not_ be respected by NFSD? But
>>> then the client could assume no COMMIT is needed? And does this then
>>> elevate this patch to be a stable@ fix? (no pun intended).
>>>
>>> If not, please help me understand why.
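The concern above, restated as a sketch of the client-side rule that decides whether a follow-up COMMIT is needed (illustrative only, not actual Linux NFS client code):

```c
#include <assert.h>
#include <stdbool.h>

enum nfs_stable_how {
	NFS_UNSTABLE  = 0,
	NFS_DATA_SYNC = 1,
	NFS_FILE_SYNC = 2,
};

/*
 * Illustrative only: a client schedules a follow-up COMMIT when the
 * reply's committed field is weaker than FILE_SYNC. If a server
 * downgraded the write internally but still echoed FILE_SYNC back,
 * a client applying this rule would skip a COMMIT it actually needs;
 * that is the hazard the question raises.
 */
static bool client_needs_commit(enum nfs_stable_how committed)
{
	return committed != NFS_FILE_SYNC;
}
```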
>>
>> The protocol permits an NFS server to change the stable_how field in a
>> WRITE response as follows:
>>
>> UNSTABLE -> DATA_SYNC
>> UNSTABLE -> FILE_SYNC
>> DATA_SYNC -> FILE_SYNC
>>
>> It forbids the reverse direction. A client that asks for a FILE_SYNC
>> WRITE MUST get a FILE_SYNC reply.
>>
>> What the EX_ISSYNC test is doing is looking for the "async" export
>> option. When that option is specified, internally NFSD converts all
>> WRITEs, including FILE_SYNC WRITEs, to UNSTABLE. It then complies with
>> the protocol by not telling the client about this invalid change and
>> defies the protocol by not ensuring FILE_SYNC WRITEs are persisted
>> before replying. See exports(5).
>>
>> In these cases, each WRITE response still reflects what the client
>> requested, but it does not reflect what the server actually did.
>>
>> We tell anyone who will listen (and even those who won't) never to use
>> the "async" export option because of the silent data corruption risk it
>> introduces. But it goes faster than using the fully protocol-compliant
>> "sync" export option, so people use it anyway.
>>
>>
>
> Somewhat related question:
>
> Since we're on track to deprecate NFSv2 support soon, should we be
> looking at deprecating the "async" export option too? v2 was the main
> reason for it in the first place, after all.
Based on my experience with a few folks who have productized NFSD, I
expect there would be howls of protest. "async" helps even the two-
phase WRITE in newer NFS versions perform faster.
I've made the point that modern persistent storage is so fast that
"async" is pretty meaningless. Some folks still have slow storage
that they need to share via NFS, however.
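For reference, the option in question is configured per export in /etc/exports; a sketch with made-up paths and client specs (see exports(5) for the full syntax):

```
# /etc/exports
# "async": NFSD may reply before data reaches stable storage (fast, unsafe)
/srv/scratch  192.168.1.0/24(rw,async,no_subtree_check)
# "sync": the protocol-compliant behavior (and the default)
/srv/data     192.168.1.0/24(rw,sync,no_subtree_check)
```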
>>> Thanks!
>>>
>>>> @@ -1434,7 +1435,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> * @offset: Byte offset of start
>>>> * @payload: xdr_buf containing the write payload
>>>> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
>>>> - * @stable: An NFS stable_how value
>>>> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
>>>> * @verf: NFS WRITE verifier
>>>> *
>>>> * Upon return, caller must invoke fh_put on @fhp.
>>>> @@ -1444,7 +1445,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> */
>>>> __be32
>>>> nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>>>> - const struct xdr_buf *payload, unsigned long *cnt, int stable,
>>>> + const struct xdr_buf *payload, unsigned long *cnt, u32 *stable_how,
>>>> __be32 *verf)
>>>> {
>>>> struct nfsd_file *nf;
>>>> @@ -1457,7 +1458,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>>>> goto out;
>>>>
>>>> err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
>>>> - stable, verf);
>>>> + stable_how, verf);
>>>> nfsd_file_put(nf);
>>>> out:
>>>> trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
>>>> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
>>>> index fa46f8b5f132..c713ed0b04e0 100644
>>>> --- a/fs/nfsd/vfs.h
>>>> +++ b/fs/nfsd/vfs.h
>>>> @@ -130,11 +130,13 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> u32 *eof);
>>>> __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> loff_t offset, const struct xdr_buf *payload,
>>>> - unsigned long *cnt, int stable, __be32 *verf);
>>>> + unsigned long *cnt, u32 *stable_how,
>>>> + __be32 *verf);
>>>> __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> struct nfsd_file *nf, loff_t offset,
>>>> const struct xdr_buf *payload,
>>>> - unsigned long *cnt, int stable, __be32 *verf);
>>>> + unsigned long *cnt, u32 *stable_how,
>>>> + __be32 *verf);
>>>> __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
>>>> char *, int *);
>>>> __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
>>>> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
>>>> index 522067b7fd75..c0e443ef3a6b 100644
>>>> --- a/fs/nfsd/xdr3.h
>>>> +++ b/fs/nfsd/xdr3.h
>>>> @@ -152,7 +152,7 @@ struct nfsd3_writeres {
>>>> __be32 status;
>>>> struct svc_fh fh;
>>>> unsigned long count;
>>>> - int committed;
>>>> + u32 committed;
>>>> __be32 verf[2];
>>>> };
>>>>
>>>> --
>>>> 2.51.0
>>>>
>>
>
--
Chuck Lever
^ permalink raw reply	[flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-15 18:11 ` Jeff Layton
2025-10-15 18:16 ` Chuck Lever
@ 2025-10-17 6:22 ` NeilBrown
2025-10-17 11:09 ` Jeff Layton
2025-10-17 13:43 ` Chuck Lever
1 sibling, 2 replies; 23+ messages in thread
From: NeilBrown @ 2025-10-17 6:22 UTC (permalink / raw)
To: Jeff Layton
Cc: Chuck Lever, Mike Snitzer, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Thu, 16 Oct 2025, Jeff Layton wrote:
>
> Somewhat related question:
>
> Since we're on track to deprecate NFSv2 support soon, should we be
> looking at deprecating the "async" export option too? v2 was the main
> reason for it in the first place, after all.
Are we? Is that justified?
We at SUSE had a customer who had some old but still perfectly
functional industrial equipment which wrote logs using NFSv2.
So we had to revert the nfs-utils changes which disabled NFSv2.
It would be nice if we/they didn't have to do that to the kernel as
well.
What is the down-side of continuing v2 support? The test matrix isn't
that big. Of course we need to revert the nfs-utils changes to continue
testing. I'd be in favour of that anyway.
And async can still have a place for the hobbyist. If a human is
overseeing a process and is prepared to deal with a server crash if it
happens, but doesn't want to be slowed down by the cost of being careful
just-in-case, then async is a perfectly rational choice.
I'm not in favour of deprecating things that work.
NeilBrown
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 6:22 ` NeilBrown
@ 2025-10-17 11:09 ` Jeff Layton
2025-10-17 23:15 ` NeilBrown
2025-10-17 13:43 ` Chuck Lever
1 sibling, 1 reply; 23+ messages in thread
From: Jeff Layton @ 2025-10-17 11:09 UTC (permalink / raw)
To: NeilBrown
Cc: Chuck Lever, Mike Snitzer, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Fri, 2025-10-17 at 17:22 +1100, NeilBrown wrote:
> On Thu, 16 Oct 2025, Jeff Layton wrote:
> >
> > Somewhat related question:
> >
> > Since we're on track to deprecate NFSv2 support soon, should we be
> > looking at deprecating the "async" export option too? v2 was the main
> > reason for it in the first place, after all.
>
> Are we? Is that justified?
>
> We at SUSE had a customer who had some old but still perfectly
> functional industrial equipment which wrote logs using NFSv2.
> So we had to revert the nfs-utils changes which disabled NFSv2.
> It would be nice if we/they didn't have to do that to the kernel as
> well.
>
> What is the down-side of continuing v2 support? The test matrix isn't
> that big. Of course we need to revert the nfs-utils changes to continue
> testing. I'd be in favour of that anyway.
>
> And async can still have a place for the hobbyist. If a human is
> overseeing a process and is prepared to deal with a server crash if it
> happens, but doesn't want to be slowed down by the cost of being careful
> just-in-case, then async is a perfectly rational choice.
>
> I'm not in favour of deprecating things that work.
>
> NeilBrown
I think we should. We deprecate obsolete drivers (and even CPU
architectures!) all the time. Arnd has even started discussing
deprecating 32-bit CPU support in the kernel altogether. Why would we
not deprecate obsolete protocols too? The handful of people that still
need v2 support can boot to an old kernel.
Supporting NFSv2 is a maintenance burden (albeit a mostly minor one at
this point). If we want to properly support it, it has to be tested,
which I don't think anyone is doing currently. I only have so much time
to spend on upstream maintenance, and I don't care to spend it
supporting v2.
SuSE's customer using ancient equipment is an exception here, but I
don't think that's enough to justify all of us spending time keeping v2
limping along.
Do you have the same attachment to NFSv4.0? We had a discussion around
starting to deprecate it as well. Having to support v4.0 _is_ a
significant maintenance burden, IMO.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 11:09 ` Jeff Layton
@ 2025-10-17 23:15 ` NeilBrown
0 siblings, 0 replies; 23+ messages in thread
From: NeilBrown @ 2025-10-17 23:15 UTC (permalink / raw)
To: Jeff Layton
Cc: Chuck Lever, Mike Snitzer, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Fri, 17 Oct 2025, Jeff Layton wrote:
> On Fri, 2025-10-17 at 17:22 +1100, NeilBrown wrote:
> > On Thu, 16 Oct 2025, Jeff Layton wrote:
> > >
> > > Somewhat related question:
> > >
> > > Since we're on track to deprecate NFSv2 support soon, should we be
> > > looking at deprecating the "async" export option too? v2 was the main
> > > reason for it in the first place, after all.
> >
> > Are we? Is that justified?
> >
> > We at SUSE had a customer who had some old but still perfectly
> > functional industrial equipment which wrote logs using NFSv2.
> > So we had to revert the nfs-utils changes which disabled NFSv2.
> > It would be nice if we/they didn't have to do that to the kernel as
> > well.
> >
> > What is the down-side of continuing v2 support? The test matrix isn't
> > that big. Of course we need to revert the nfs-utils changes to continue
> > testing. I'd be in favour of that anyway.
> >
> > And async can still have a place for the hobbyist. If a human is
> > overseeing a process and is prepared to deal with a server crash if it
> > happens, but doesn't want to be slowed down by the cost of being careful
> > just-in-case, then async is a perfectly rational choice.
> >
> > I'm not in favour of deprecating things that work.
> >
> > NeilBrown
>
> I think we should. We deprecate obsolete drivers (and even CPU
> architectures!) all the time. Arnd has even started discussing
> deprecating 32-bit CPU support in the kernel altogether. Why would we
> not deprecate obsolete protocols too? The handful of people that still
> need v2 support can boot to an old kernel.
Protocols support interoperability.
I might want to run new software on new hardware while being able to
talk to old software running on old hardware.
So I think deprecating protocols requires more justification than
deprecating drivers or architectures.
>
> Supporting NFSv2 is a maintenance burden (albeit a mostly minor one at
> this point). If we want to properly support it, it has to be tested,
> which I don't think anyone is doing currently. I only have so much time
> to spend on upstream maintenance, and I don't care to spend it
> supporting v2.
>
> SuSE's customer using ancient equipment is an exception here, but I
> don't think that's enough to justify all of us spending time keeping v2
> limping along.
How much time do we spend? 128 patches in 10 years.
(and BTW it hasn't been SuSE for a long time - it is SUSE :-)
>
> Do you have the same attachment to NFSv4.0? We had a discussion around
> starting to deprecate it as well. Having to support v4.0 _is_ a
> significant maintenance burden, IMO.
Good question. My perception is that it is still used but its use is
decreasing. As it is so much more complex than v3, I'd be surprised if
there was any use in "industrial" space where long lifetimes are needed.
I agree that v4.0 is a greater support burden than v2, and that is a
significant consideration. I think it is good that v2 has a config
option that defaults to 'n'. I think a similar option for v4.0 would be
good. That would also give a clearer picture of how intrusive v4.0
support is.
If supporting v2 really is, or becomes, a burden then removing it should
be considered. But if it isn't a measurable burden, then I would be in
favour of keeping it.
NeilBrown
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 6:22 ` NeilBrown
2025-10-17 11:09 ` Jeff Layton
@ 2025-10-17 13:43 ` Chuck Lever
2025-10-17 23:19 ` NeilBrown
1 sibling, 1 reply; 23+ messages in thread
From: Chuck Lever @ 2025-10-17 13:43 UTC (permalink / raw)
To: NeilBrown, Jeff Layton
Cc: Mike Snitzer, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On 10/17/25 2:22 AM, NeilBrown wrote:
> On Thu, 16 Oct 2025, Jeff Layton wrote:
>>
>> Somewhat related question:
>>
>> Since we're on track to deprecate NFSv2 support soon, should we be
>> looking at deprecating the "async" export option too? v2 was the main
>> reason for it in the first place, after all.
>
> Are we? Is that justified?
>
> We at SUSE had a customer who had some old but still perfectly
> functional industrial equipment which wrote logs using NFSv2.
> So we had to revert the nfs-utils changes which disabled NFSv2.
> It would be nice if we/they didn't have to do that to the kernel as
> well.
It's difficult to make deprecation decisions like this because such
customers are rare and are unrepresented to upstream developers. But
it's clear that there are at least two interesting alternatives
for such customers:
- Stick with older releases of Linux
- Switch to a user space client implementation
> What is the down-side of continuing v2 support? The test matrix isn't
> that big. Of course we need to revert the nfs-utils changes to continue
> testing. I'd be in favour of that anyway.
IMO changes to remove NFSv2 support from nfs-utils were premature. NFSv2
can still be enabled in the kernel, so nfs-utils should continue to have
full support for it.
> And async can still have a place for the hobbyist. If a human is
> overseeing a process and is prepared to deal with a server crash if it
> happens, but doesn't want to be slowed down by the cost of being careful
> just-in-case, then async is a perfectly rational choice.
async is not a hobbyist-only setting. I am aware of production
deployments that use the setting.
As much as I hate async, it's not something the user community will
permit us to remove, so it's going to remain for a while. But as I
mentioned before, it seems to have less and less purpose as persistent
storage speeds continue to improve.
--
Chuck Lever
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 13:43 ` Chuck Lever
@ 2025-10-17 23:19 ` NeilBrown
0 siblings, 0 replies; 23+ messages in thread
From: NeilBrown @ 2025-10-17 23:19 UTC (permalink / raw)
To: Chuck Lever
Cc: Jeff Layton, Mike Snitzer, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Sat, 18 Oct 2025, Chuck Lever wrote:
> On 10/17/25 2:22 AM, NeilBrown wrote:
> > On Thu, 16 Oct 2025, Jeff Layton wrote:
> >>
> >> Somewhat related question:
> >>
> >> Since we're on track to deprecate NFSv2 support soon, should we be
> >> looking at deprecating the "async" export option too? v2 was the main
> >> reason for it in the first place, after all.
> >
> > Are we? Is that justified?
> >
> > We at SUSE had a customer who had some old but still perfectly
> > functional industrial equipment which wrote logs using NFSv2.
> > So we had to revert the nfs-utils changes which disabled NFSv2.
> > It would be nice if we/they didn't have to do that to the kernel as
> > well.
>
> It's difficult to make deprecation decisions like this because such
> customers are rare and are unrepresented to upstream developers. But
> it's clear that there are at least two interesting alternatives
> for such customers:
>
> - Stick with older releases of Linux
> - Switch to a user space client implementation
Certainly they are options, but not without problems.
Using older software is not best practice.
Using a user-space server (I assume you mean) works as long as that
server doesn't follow our lead to discard v2 support.
>
>
> > What is the down-side of continuing v2 support? The test matrix isn't
> > that big. Of course we need to revert the nfs-utils changes to continue
> > testing. I'd be in favour of that anyway.
>
> IMO changes to remove NFSv2 support from nfs-utils were premature. NFSv2
> can still be enabled in the kernel, so nfs-utils should continue to have
> full support for it.
>
Maybe I should post a patch to restore v2 support under a config option.
Thanks,
NeilBrown
>
> > And async can still have a place for the hobbyist. If a human is
> > overseeing a process and is prepared to deal with a server crash if it
> > happens, but doesn't want to be slowed down by the cost of being careful
> > just-in-case, then async is a perfectly rational choice.
>
> async is not a hobbyist-only setting. I am aware of production
> deployments that use the setting.
>
> As much as I hate async, it's not something the user community will
> permit us to remove, so it's going to remain for a while. But as I
> mentioned before, it seems to have less and less purpose as persistent
> storage speeds continue to improve.
>
>
> --
> Chuck Lever
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-15 18:00 ` Chuck Lever
2025-10-15 18:11 ` Jeff Layton
@ 2025-10-15 18:40 ` Mike Snitzer
1 sibling, 0 replies; 23+ messages in thread
From: Mike Snitzer @ 2025-10-15 18:40 UTC (permalink / raw)
To: Chuck Lever
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Wed, Oct 15, 2025 at 02:00:12PM -0400, Chuck Lever wrote:
> On 10/15/25 12:53 PM, Mike Snitzer wrote:
> > On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >>
> >> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> >> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> >> not need a subsequent COMMIT operation, saving a round trip and
> >> allowing the client to dispense with cached dirty data as soon as
> >> it receives the server's WRITE response.
> >>
> >> This patch refactors nfsd_vfs_write() to return a possibly modified
> >> stable_how value to its callers. No behavior change is expected.
> >>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/nfsd/nfs3proc.c | 2 +-
> >> fs/nfsd/nfs4proc.c | 2 +-
> >> fs/nfsd/nfsproc.c | 3 ++-
> >> fs/nfsd/vfs.c | 11 ++++++-----
> >> fs/nfsd/vfs.h | 6 ++++--
> >> fs/nfsd/xdr3.h | 2 +-
> >> 6 files changed, 15 insertions(+), 11 deletions(-)
> >>
> >> Here's what I had in mind, based on a patch I did a year ago but
> >> never posted.
> >>
> >> If nfsd_vfs_write() promotes an NFS_UNSTABLE write to NFS_FILE_SYNC,
> >> it can now set *stable_how and the WRITE encoders will return the
> >> updated value to the client.
> >>
> >>
> >> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> >> index b6d03e1ef5f7..ad14b34583bb 100644
> >> --- a/fs/nfsd/nfs3proc.c
> >> +++ b/fs/nfsd/nfs3proc.c
> >> @@ -236,7 +236,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
> >> resp->committed = argp->stable;
> >> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> >> &argp->payload, &cnt,
> >> - resp->committed, resp->verf);
> >> + &resp->committed, resp->verf);
> >> resp->count = cnt;
> >> resp->status = nfsd3_map_status(resp->status);
> >> return rpc_success;
> >> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> >> index 7f7e6bb23a90..2222bb283baf 100644
> >> --- a/fs/nfsd/nfs4proc.c
> >> +++ b/fs/nfsd/nfs4proc.c
> >> @@ -1285,7 +1285,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >> write->wr_how_written = write->wr_stable_how;
> >> status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
> >> write->wr_offset, &write->wr_payload,
> >> - &cnt, write->wr_how_written,
> >> + &cnt, &write->wr_how_written,
> >> (__be32 *)write->wr_verifier.data);
> >> nfsd_file_put(nf);
> >>
> >> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> >> index 8f71f5748c75..706401ed6f8d 100644
> >> --- a/fs/nfsd/nfsproc.c
> >> +++ b/fs/nfsd/nfsproc.c
> >> @@ -251,6 +251,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> >> struct nfsd_writeargs *argp = rqstp->rq_argp;
> >> struct nfsd_attrstat *resp = rqstp->rq_resp;
> >> unsigned long cnt = argp->len;
> >> + u32 committed = NFS_DATA_SYNC;
> >>
> >> dprintk("nfsd: WRITE %s %u bytes at %d\n",
> >> SVCFH_fmt(&argp->fh),
> >> @@ -258,7 +259,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
> >>
> >> fh_copy(&resp->fh, &argp->fh);
> >> resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> >> - &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
> >> + &argp->payload, &cnt, &committed, NULL);
> >> if (resp->status == nfs_ok)
> >> resp->status = fh_getattr(&resp->fh, &resp->stat);
> >> else if (resp->status == nfserr_jukebox)
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index f537a7b4ee01..8b2dc7a88aab 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -1262,7 +1262,7 @@ static int wait_for_concurrent_writes(struct file *file)
> >> * @offset: Byte offset of start
> >> * @payload: xdr_buf containing the write payload
> >> * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
> >> - * @stable: An NFS stable_how value
> >> + * @stable_how: IN: Client's requested stable_how, OUT: Actual stable_how
> >> * @verf: NFS WRITE verifier
> >> *
> >> * Upon return, caller must invoke fh_put on @fhp.
> >> @@ -1274,11 +1274,12 @@ __be32
> >> nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> struct nfsd_file *nf, loff_t offset,
> >> const struct xdr_buf *payload, unsigned long *cnt,
> >> - int stable, __be32 *verf)
> >> + u32 *stable_how, __be32 *verf)
> >> {
> >> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> >> struct file *file = nf->nf_file;
> >> struct super_block *sb = file_inode(file)->i_sb;
> >> + u32 stable = *stable_how;
> >> struct kiocb kiocb;
> >> struct svc_export *exp;
> >> struct iov_iter iter;
> >
> > Seems we need this instead here?
> >
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 5e5e5187d2e5..d0c89f8fdb57 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1479,7 +1479,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> > struct file *file = nf->nf_file;
> > struct super_block *sb = file_inode(file)->i_sb;
> > - u32 stable = *stable_how;
> > + u32 stable;
> > struct kiocb kiocb;
> > struct svc_export *exp;
> > errseq_t since;
> > @@ -1511,7 +1511,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > exp = fhp->fh_export;
> >
> > if (!EX_ISSYNC(exp))
> > - stable = NFS_UNSTABLE;
> > + *stable_how = NFS_UNSTABLE;
> > + stable = *stable_how;
> > init_sync_kiocb(&kiocb, file);
> > kiocb.ki_pos = offset;
> > if (stable && !fhp->fh_use_wgather)
> >
> > Otherwise, isn't there potential for nfsd_vfs_write's NFS_UNSTABLE
> > override to cause a client-set stable_how, one set to something
> > other than NFS_UNSTABLE, to silently _not_ be respected by NFSD? But
> > the client could then assume that no COMMIT is needed? And does this
> > elevate this patch to be a stable@ fix? (no pun intended).
> >
> > If not, please help me understand why.
>
> The protocol permits an NFS server to change the stable_how field in a
> WRITE response as follows:
>
> UNSTABLE -> DATA_SYNC
> UNSTABLE -> FILE_SYNC
> DATA_SYNC -> FILE_SYNC
>
> It forbids the reverse direction. A client that asks for a FILE_SYNC
> WRITE MUST get a FILE_SYNC reply.
>
> What the EX_ISSYNC test is doing is looking for the "async" export
> option. When that option is specified, internally NFSD converts all
> WRITEs, including FILE_SYNC WRITEs, to UNSTABLE. It then complies with
> the protocol by not telling the client about this invalid change and
> defies the protocol by not ensuring FILE_SYNC WRITEs are persisted
> before replying. See exports(5).
>
> In these cases, each WRITE response still reflects what the client
> requested, but it does not reflect what the server actually did.
>
> We tell anyone who will listen (and even those who won't) never to use
> the "async" export option because of the silent data corruption risk it
> introduces. But it goes faster than using the fully protocol-compliant
> "sync" export option, so people use it anyway.
Thanks for all of that, very helpful!
FYI, I'm now actively working on rebasing the NFSD Direct WRITE
support on top of your stable_how patch.
Mike
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-13 19:01 [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
` (2 preceding siblings ...)
2025-10-15 16:53 ` Mike Snitzer
@ 2025-10-15 22:13 ` Mike Snitzer
2025-10-16 16:10 ` Jeff Layton
2025-10-17 14:13 ` Chuck Lever
2025-10-17 4:08 ` [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Christoph Hellwig
4 siblings, 2 replies; 23+ messages in thread
From: Mike Snitzer @ 2025-10-15 22:13 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton
Cc: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle extent is DIO-aligned and
the start and/or end are misaligned. Synchronous buffered IO (with
preference towards using DONTCACHE) is used for the misaligned extents
and O_DIRECT is used for the middle DIO-aligned extent.
nfsd_issue_write_dio() promotes @stable_how to NFS_FILE_SYNC, which
allows the client to drop its dirty data and avoid needing an extra
COMMIT operation.
If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to
invalidate the page cache on behalf of the DIO WRITE, then
nfsd_issue_write_dio() will fall back to using buffered IO.
These changes served as the original starting point for the NFS
client's misaligned O_DIRECT support that landed with commit
c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and
WRITE"). But NFSD's support is simpler because it currently doesn't
use AIO completion.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/debugfs.c | 1 +
fs/nfsd/trace.h | 1 +
fs/nfsd/vfs.c | 218 ++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 215 insertions(+), 5 deletions(-)
Changes since v2:
- rebased on top of "[PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients"
- set both IOCB_DSYNC and IOCB_SYNC (rather than just IOCB_SYNC) in nfsd_issue_write_dio()
- update nfsd_issue_write_dio() to promote @stable_how to NFS_FILE_SYNC
- push call to trace_nfsd_write_direct down from nfsd_direct_write to nfsd_issue_write_dio
- fix comment block style to have naked '/*' on first line
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 00eb1ecef6ac..7f44689e0a53 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
switch (val) {
case NFSD_IO_BUFFERED:
case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
nfsd_io_cache_write = val;
break;
default:
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index bfd41236aff2..ad74439d0105 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_direct);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a3dee33a7233..ba7cb698ac68 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1253,6 +1253,210 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+struct nfsd_write_dio {
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
+static bool
+nfsd_is_write_dio_possible(loff_t offset, unsigned long len,
+ struct nfsd_file *nf, struct nfsd_write_dio *write_dio)
+{
+ const u32 dio_blocksize = nf->nf_dio_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
+ return false;
+ if (unlikely(dio_blocksize > PAGE_SIZE))
+ return false;
+ if (unlikely(len < dio_blocksize))
+ return false;
+
+ start_end = round_up(offset, dio_blocksize);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, dio_blocksize);
+
+ write_dio->start_len = start_end - offset;
+ write_dio->middle_len = middle_end - start_end;
+ write_dio->end_len = orig_end - middle_end;
+
+ return true;
+}
+
+static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
+/*
+ * Setup as many as 3 iov_iter based on extents described by @write_dio.
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
+ struct bio_vec *rq_bvec, unsigned int nvecs,
+ unsigned long cnt, struct nfsd_write_dio *write_dio,
+ struct nfsd_file *nf)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = *iterp;
+
+ /* Setup misaligned start? */
+ if (write_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iters[n_iters].count = write_dio->start_len;
+ iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Setup DIO-aligned middle */
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ if (write_dio->start_len)
+ iov_iter_advance(&iters[n_iters], write_dio->start_len);
+ iters[n_iters].count -= write_dio->end_len;
+ iter_is_dio_aligned[n_iters] =
+ nfsd_iov_iter_aligned_bvec(&iters[n_iters],
+ nf->nf_dio_mem_align-1, nf->nf_dio_offset_align-1);
+ if (unlikely(!iter_is_dio_aligned[n_iters]))
+ return 0; /* no DIO-aligned IO possible */
+ ++n_iters;
+
+ /* Setup misaligned end? */
+ if (write_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
+ iov_iter_advance(&iters[n_iters],
+ write_dio->start_len + write_dio->middle_len);
+ iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ return n_iters;
+}
+
+static int
+nfsd_buffered_write(struct svc_rqst *rqstp, struct file *file,
+ unsigned int nvecs, unsigned long *cnt,
+ struct kiocb *kiocb)
+{
+ struct iov_iter iter;
+ int host_err;
+
+ iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+ host_err = vfs_iocb_iter_write(file, kiocb, &iter);
+ if (host_err < 0)
+ return host_err;
+ *cnt = host_err;
+
+ return 0;
+}
+
+static int
+nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ u32 *stable_how, unsigned int nvecs, unsigned long *cnt,
+ struct kiocb *kiocb, struct nfsd_write_dio *write_dio)
+{
+ struct file *file = nf->nf_file;
+ bool iter_is_dio_aligned[3];
+ struct iov_iter iter_stack[3];
+ struct iov_iter *iter = iter_stack;
+ unsigned int n_iters = 0;
+ unsigned long in_count = *cnt;
+ loff_t in_offset = kiocb->ki_pos;
+ ssize_t host_err;
+
+ n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned,
+ rqstp->rq_bvec, nvecs, *cnt, write_dio, nf);
+ if (unlikely(!n_iters))
+ return nfsd_buffered_write(rqstp, file, nvecs, cnt, kiocb);
+
+ trace_nfsd_write_direct(rqstp, fhp, in_offset, in_count);
+
+ /*
+ * Any buffered IO issued here will be misaligned, use
+ * sync IO to ensure it has completed before returning.
+ * Also update @stable_how to avoid need for COMMIT.
+ */
+ kiocb->ki_flags |= (IOCB_DSYNC|IOCB_SYNC);
+ *stable_how = NFS_FILE_SYNC;
+
+ *cnt = 0;
+ for (int i = 0; i < n_iters; i++) {
+ if (iter_is_dio_aligned[i])
+ kiocb->ki_flags |= IOCB_DIRECT;
+ else
+ kiocb->ki_flags &= ~IOCB_DIRECT;
+
+ host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
+ if (host_err < 0) {
+ /*
+ * VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ if (unlikely(host_err == -ENOTBLK)) {
+ kiocb->ki_flags &= ~IOCB_DIRECT;
+ *cnt = in_count;
+ kiocb->ki_pos = in_offset;
+ return nfsd_buffered_write(rqstp, file,
+ nvecs, cnt, kiocb);
+ } else if (unlikely(host_err == -EINVAL)) {
+ struct inode *inode = d_inode(fhp->fh_dentry);
+
+ pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
+ host_err = -ESERVERFAULT;
+ }
+ return host_err;
+ }
+ *cnt += host_err;
+ if (host_err < iter[i].count) /* partial write? */
+ break;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, u32 *stable_how, unsigned int nvecs,
+ unsigned long *cnt, struct kiocb *kiocb)
+{
+ struct nfsd_write_dio write_dio;
+
+ /*
+ * Check if IOCB_DONTCACHE can be used when issuing buffered IO;
+ * if so, set it to preserve intent of NFSD_IO_DIRECT (it will
+ * be ignored for any DIO issued here).
+ */
+ if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
+ kiocb->ki_flags |= IOCB_DONTCACHE;
+
+ if (nfsd_is_write_dio_possible(kiocb->ki_pos, *cnt, nf, &write_dio))
+ return nfsd_issue_write_dio(rqstp, fhp, nf, stable_how, nvecs,
+ cnt, kiocb, &write_dio);
+
+ return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb);
+}
+
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1281,7 +1485,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
u32 stable = *stable_how;
struct kiocb kiocb;
struct svc_export *exp;
- struct iov_iter iter;
errseq_t since;
__be32 nfserr;
int host_err;
@@ -1318,25 +1521,30 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
kiocb.ki_flags |= IOCB_DSYNC;
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
- iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
switch (nfsd_io_cache_write) {
- case NFSD_IO_BUFFERED:
+ case NFSD_IO_DIRECT:
+ host_err = nfsd_direct_write(rqstp, fhp, nf, stable_how,
+ nvecs, cnt, &kiocb);
+ stable = *stable_how;
break;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
kiocb.ki_flags |= IOCB_DONTCACHE;
+ fallthrough; /* must call nfsd_buffered_write */
+ case NFSD_IO_BUFFERED:
+ host_err = nfsd_buffered_write(rqstp, file,
+ nvecs, cnt, &kiocb);
break;
}
- host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
- *cnt = host_err;
nfsd_stats_io_write_add(nn, exp, *cnt);
fsnotify_modify(file);
host_err = filemap_check_wb_err(file->f_mapping, since);
--
2.44.0
^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-15 22:13 ` [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
@ 2025-10-16 16:10 ` Jeff Layton
2025-10-17 14:13 ` Chuck Lever
1 sibling, 0 replies; 23+ messages in thread
From: Jeff Layton @ 2025-10-16 16:10 UTC (permalink / raw)
To: Mike Snitzer, Chuck Lever
Cc: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On Wed, 2025-10-15 at 18:13 -0400, Mike Snitzer wrote:
> [ v3 patch quoted in full; snipped ]
LGTM
Reviewed-by: Jeff Layton <jlayton@kernel.org>
^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-15 22:13 ` [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
2025-10-16 16:10 ` Jeff Layton
@ 2025-10-17 14:13 ` Chuck Lever
2025-10-17 21:03 ` Mike Snitzer
2025-10-17 23:23 ` NeilBrown
1 sibling, 2 replies; 23+ messages in thread
From: Chuck Lever @ 2025-10-17 14:13 UTC (permalink / raw)
To: Mike Snitzer, Jeff Layton
Cc: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On 10/15/25 6:13 PM, Mike Snitzer wrote:
> [ patch description snipped ]
Handful of nits, and one interesting question. Please send a v4 to
address the two comment issues below.
Question: How is NFSD supposed to act if the administrator sets the
async export option and NFSD_IO_DIRECT at the same time?
- Direct I/O dispatched but not waited for ?
- Fall back to DONTCACHE ?
- Ignore the async setting (that's what it appears to do now) ?
- Something else ?
IMO async + DIRECT seems like a disaster waiting to happen. It should
be pretty easy to push the NFS server over a cliff because there is no
connection between clients submitting writes and data getting persisted.
Dirty data will just pile up in the server's memory because it can't get
flushed fast enough.
Synchronous direct I/O has a very strong connection between how fast
persistent storage can go and how fast clients can push more writes.
> [ ... ]
> +static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
> + unsigned int addr_mask, unsigned int len_mask)
IIRC you had mentioned that there is possibly a generic utility
function that does this that we could adopt here. If we are going
to keep this one in spite of having a generic, it needs a comment
explaining why we're not using the generic version.
> [ ... ]
> + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
> + if (host_err < 0) {
> + /*
> + * VFS will return -ENOTBLK if DIO WRITE fails to
> + * invalidate the page cache. Retry using buffered IO.
> + */
I'm debating with myself whether, now that NFSD is using DIO, nfserrno
should get a mapping from ENOTBLK to nfserr_serverfault or nfserr_io,
simply as a defensive measure.
I kind of like the idea that we get a warning in nfserrno: "hey, I
didn't expect ENOTBLK here" so I'm leaning towards leaving it as is
for now.
> + if (unlikely(host_err == -ENOTBLK)) {
> + kiocb->ki_flags &= ~IOCB_DIRECT;
> + *cnt = in_count;
> + kiocb->ki_pos = in_offset;
> + return nfsd_buffered_write(rqstp, file,
> + nvecs, cnt, kiocb);
> + } else if (unlikely(host_err == -EINVAL)) {
> + struct inode *inode = d_inode(fhp->fh_dentry);
> +
> + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
> + inode->i_sb->s_id, inode->i_ino);
> + host_err = -ESERVERFAULT;
> + }
> + return host_err;
> + }
> + *cnt += host_err;
> + if (host_err < iter[i].count) /* partial write? */
> + break;
> + }
> +
> + return 0;
> +}
> +
> +static noinline_for_stack int
> +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> + struct nfsd_file *nf, u32 *stable_how, unsigned int nvecs,
> + unsigned long *cnt, struct kiocb *kiocb)
> +{
> + struct nfsd_write_dio write_dio;
> +
> + /*
> + * Check if IOCB_DONTCACHE can be used when issuing buffered IO;
> + * if so, set it to preserve intent of NFSD_IO_DIRECT (it will
> + * be ignored for any DIO issued here).
> + */
> + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
> + kiocb->ki_flags |= IOCB_DONTCACHE;
> +
> + if (nfsd_is_write_dio_possible(kiocb->ki_pos, *cnt, nf, &write_dio))
> + return nfsd_issue_write_dio(rqstp, fhp, nf, stable_how, nvecs,
> + cnt, kiocb, &write_dio);
> +
> + return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb);
> +}
> +
> /**
> * nfsd_vfs_write - write data to an already-open file
> * @rqstp: RPC execution context
> @@ -1281,7 +1485,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> u32 stable = *stable_how;
> struct kiocb kiocb;
> struct svc_export *exp;
> - struct iov_iter iter;
> errseq_t since;
> __be32 nfserr;
> int host_err;
> @@ -1318,25 +1521,30 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> kiocb.ki_flags |= IOCB_DSYNC;
>
> nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
> - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
> +
> since = READ_ONCE(file->f_wb_err);
> if (verf)
> nfsd_copy_write_verifier(verf, nn);
>
> switch (nfsd_io_cache_write) {
> - case NFSD_IO_BUFFERED:
> + case NFSD_IO_DIRECT:
> + host_err = nfsd_direct_write(rqstp, fhp, nf, stable_how,
> + nvecs, cnt, &kiocb);
> + stable = *stable_how;
> break;
> case NFSD_IO_DONTCACHE:
> if (file->f_op->fop_flags & FOP_DONTCACHE)
> kiocb.ki_flags |= IOCB_DONTCACHE;
> + fallthrough; /* must call nfsd_buffered_write */
I'm not clear what purpose this comment serves. The fallthrough
already states what's going to happen next.
> + case NFSD_IO_BUFFERED:
> + host_err = nfsd_buffered_write(rqstp, file,
> + nvecs, cnt, &kiocb);
> break;
> }
> - host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
> if (host_err < 0) {
> commit_reset_write_verifier(nn, rqstp, host_err);
> goto out_nfserr;
> }
> - *cnt = host_err;
> nfsd_stats_io_write_add(nn, exp, *cnt);
> fsnotify_modify(file);
> host_err = filemap_check_wb_err(file->f_mapping, since);
--
Chuck Lever
^ permalink raw reply [flat|nested] 23+ messages in thread

* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-17 14:13 ` Chuck Lever
@ 2025-10-17 21:03 ` Mike Snitzer
2025-10-17 21:54 ` Chuck Lever
2025-10-17 23:23 ` NeilBrown
1 sibling, 1 reply; 23+ messages in thread
From: Mike Snitzer @ 2025-10-17 21:03 UTC (permalink / raw)
To: Chuck Lever
Cc: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Fri, Oct 17, 2025 at 10:13:14AM -0400, Chuck Lever wrote:
> On 10/15/25 6:13 PM, Mike Snitzer wrote:
> > If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
> > middle and end as needed. The large middle extent is DIO-aligned and
> > the start and/or end are misaligned. Synchronous buffered IO (with
> > preference towards using DONTCACHE) is used for the misaligned extents
> > and O_DIRECT is used for the middle DIO-aligned extent.
> >
> > nfsd_issue_write_dio() promotes @stable_how to NFS_FILE_SYNC, which
> > allows the client to drop its dirty data and avoid needing an extra
> > COMMIT operation.
> >
> > If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to
> > invalidate the page cache on behalf of the DIO WRITE, then
> > nfsd_issue_write_dio() will fall back to using buffered IO.
> >
> > These changes served as the original starting point for the NFS
> > client's misaligned O_DIRECT support that landed with commit
> > c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and
> > WRITE"). But NFSD's support is simpler because it currently doesn't
> > use AIO completion.
> >
> > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
>
> Handful of nits, and one interesting question. Please send a v4 to
> address the two comment issues below.
>
> Question: How is NFSD supposed to act if the administrator sets the
> async export option and NFSD_IO_DIRECT at the same time?
>
> - Direct I/O dispatched but not waited for ?
>
> - Fall back to DONTCACHE ?
>
> - Ignore the async setting (that's what it appears to do now) ?
Yes, and while I only just really learned of the async option, given
its unsafe nature I feel the right thing to do is ignore it. My
reasoning is that O_DIRECT on its own should serve as the basis for
performance wins; it was never engineered to account for the quirky
hack that is async.
>
> - Something else ?
>
> IMO async + DIRECT seems like a disaster waiting to happen. It should
> be pretty easy to push the NFS server over a cliff because there is no
> connection between clients submitting writes and data getting persisted.
> Dirty data will just pile up in the server's memory because it can't get
> flushed fast enough.
>
> Synchronous direct I/O has a very strong connection between how fast
> persistent storage can go and how fast clients can push more writes.
Exactly right.
> > ---
> > fs/nfsd/debugfs.c | 1 +
> > fs/nfsd/trace.h | 1 +
> > fs/nfsd/vfs.c | 218 ++++++++++++++++++++++++++++++++++++++++++++--
> > 3 files changed, 215 insertions(+), 5 deletions(-)
> >
> > Changes since v2:
> > - rebased ontop of "[PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients"
> > - set both IOCB_DSYNC and IOCB_SYNC (rather than just IOCB_SYNC) in nfsd_issue_write_dio()
> > - update nfsd_issue_write_dio() to promote @stable_how to NFS_FILE_SYNC
> > - push call to trace_nfsd_write_direct down from nfsd_direct_write to nfsd_issue_write_dio
> > - fix comment block style to have naked '/*' on first line
> >
> > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > index 00eb1ecef6ac..7f44689e0a53 100644
> > --- a/fs/nfsd/debugfs.c
> > +++ b/fs/nfsd/debugfs.c
> > @@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
> > switch (val) {
> > case NFSD_IO_BUFFERED:
> > case NFSD_IO_DONTCACHE:
> > + case NFSD_IO_DIRECT:
> > nfsd_io_cache_write = val;
> > break;
> > default:
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index bfd41236aff2..ad74439d0105 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
> > DEFINE_NFSD_IO_EVENT(read_done);
> > DEFINE_NFSD_IO_EVENT(write_start);
> > DEFINE_NFSD_IO_EVENT(write_opened);
> > +DEFINE_NFSD_IO_EVENT(write_direct);
> > DEFINE_NFSD_IO_EVENT(write_io_done);
> > DEFINE_NFSD_IO_EVENT(write_done);
> > DEFINE_NFSD_IO_EVENT(commit_start);
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a3dee33a7233..ba7cb698ac68 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1253,6 +1253,210 @@ static int wait_for_concurrent_writes(struct file *file)
> > return err;
> > }
> >
> > +struct nfsd_write_dio {
> > + ssize_t start_len; /* Length for misaligned first extent */
> > + ssize_t middle_len; /* Length for DIO-aligned middle extent */
> > + ssize_t end_len; /* Length for misaligned last extent */
> > +};
> > +
> > +static bool
> > +nfsd_is_write_dio_possible(loff_t offset, unsigned long len,
> > + struct nfsd_file *nf, struct nfsd_write_dio *write_dio)
> > +{
> > + const u32 dio_blocksize = nf->nf_dio_offset_align;
> > + loff_t start_end, orig_end, middle_end;
> > +
> > + if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
> > + return false;
> > + if (unlikely(dio_blocksize > PAGE_SIZE))
> > + return false;
> > + if (unlikely(len < dio_blocksize))
> > + return false;
> > +
> > + start_end = round_up(offset, dio_blocksize);
> > + orig_end = offset + len;
> > + middle_end = round_down(orig_end, dio_blocksize);
> > +
> > + write_dio->start_len = start_end - offset;
> > + write_dio->middle_len = middle_end - start_end;
> > + write_dio->end_len = orig_end - middle_end;
> > +
> > + return true;
> > +}
> > +
> > +static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
> > + unsigned int addr_mask, unsigned int len_mask)
>
> IIRC you had mentioned that there is possibly a generic utility
> function that does this that we could adopt here. If we are going
> to keep this one in spite of having a generic, it needs a comment
> explaining why we're not using the generic version.
No, this was the generic utility function, which Keith removed with
commit b475272f03ca ("iov_iter: remove iov_iter_is_aligned") --
despite my saying I still had a need for it I gave my blessing for it
to be removed from the iov_iter code so as not to encourage more
consumers.
I lifted it into NFSD (and NFS Direct for LOCALIO actually, see
fs/nfs/localio.c:nfs_iov_iter_aligned_bvec).
So I gave it a stay of execution, and duplicated it, due to not seeing
how to avoid needing it.
As we discussed at Bakeathon, anything that can be done to optimize
the fast path for RDMA (like was done for the READ side) would allow
us to conditionally elide this more costly checking.
You may recall you thought it was doable to expose a flag from svcrdma
to _know_ the WRITE payload is aligned? That'd be a very welcome
follow-on optimization patch. ;)
(Sadly NFS LOCALIO is thrown to the wolves of userspace possibly
sending down misaligned buffers so it'll be made to suffer having to
unconditionally call nfs_iov_iter_aligned_bvec)
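To make the check concrete, here is a small userspace sketch of the
alignment test being discussed (illustrative only, not the kernel
code): `fake_bvec` stands in for the kernel's `struct bio_vec`, and
the masks are the device's DIO memory and length alignments minus one,
exactly as `nfsd_iov_iter_aligned_bvec()` is called with
`nf_dio_mem_align-1` and `nf_dio_offset_align-1`.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Stand-in for struct bio_vec; only the fields the check reads. */
struct fake_bvec {
	size_t bv_len;
	size_t bv_offset;
};

static bool bvec_is_dio_aligned(const struct fake_bvec *bvec, int nvecs,
				size_t count, size_t iov_offset,
				size_t addr_mask, size_t len_mask)
{
	size_t skip = iov_offset;
	size_t size = count;
	int i = 0;

	if (size & len_mask)		/* total length must be a multiple */
		return false;
	while (size && i < nvecs) {
		size_t len = bvec[i].bv_len;

		if (len > size)
			len = size;
		if ((bvec[i].bv_offset + skip) & addr_mask)
			return false;	/* segment starts misaligned */
		size -= len;
		skip = 0;		/* iov_offset applies only to segment 0 */
		i++;
	}
	return true;
}
```

With 512-byte alignment masks (`addr_mask = len_mask = 511`), two
page-aligned 4 KiB segments pass, while any segment whose offset is
not a multiple of 512, or a total length that is not a multiple of
512, fails.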
> > +{
> > + const struct bio_vec *bvec = i->bvec;
> > + size_t skip = i->iov_offset;
> > + size_t size = i->count;
> > +
> > + if (size & len_mask)
> > + return false;
> > + do {
> > + size_t len = bvec->bv_len;
> > +
> > + if (len > size)
> > + len = size;
> > + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
> > + return false;
> > + bvec++;
> > + size -= len;
> > + skip = 0;
> > + } while (size);
> > +
> > + return true;
> > +}
> > +
> > +/*
> > + * Setup as many as 3 iov_iter based on extents described by @write_dio.
> > + * Returns the number of iov_iter that were setup.
> > + */
> > +static int
> > +nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
> > + struct bio_vec *rq_bvec, unsigned int nvecs,
> > + unsigned long cnt, struct nfsd_write_dio *write_dio,
> > + struct nfsd_file *nf)
> > +{
> > + int n_iters = 0;
> > + struct iov_iter *iters = *iterp;
> > +
> > + /* Setup misaligned start? */
> > + if (write_dio->start_len) {
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + iters[n_iters].count = write_dio->start_len;
> > + iter_is_dio_aligned[n_iters] = false;
> > + ++n_iters;
> > + }
> > +
> > + /* Setup DIO-aligned middle */
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + if (write_dio->start_len)
> > + iov_iter_advance(&iters[n_iters], write_dio->start_len);
> > + iters[n_iters].count -= write_dio->end_len;
> > + iter_is_dio_aligned[n_iters] =
> > + nfsd_iov_iter_aligned_bvec(&iters[n_iters],
> > + nf->nf_dio_mem_align-1, nf->nf_dio_offset_align-1);
> > + if (unlikely(!iter_is_dio_aligned[n_iters]))
> > + return 0; /* no DIO-aligned IO possible */
> > + ++n_iters;
> > +
> > + /* Setup misaligned end? */
> > + if (write_dio->end_len) {
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + iov_iter_advance(&iters[n_iters],
> > + write_dio->start_len + write_dio->middle_len);
> > + iter_is_dio_aligned[n_iters] = false;
> > + ++n_iters;
> > + }
> > +
> > + return n_iters;
> > +}
> > +
> > +static int
> > +nfsd_buffered_write(struct svc_rqst *rqstp, struct file *file,
> > + unsigned int nvecs, unsigned long *cnt,
> > + struct kiocb *kiocb)
> > +{
> > + struct iov_iter iter;
> > + int host_err;
> > +
> > + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
> > + host_err = vfs_iocb_iter_write(file, kiocb, &iter);
> > + if (host_err < 0)
> > + return host_err;
> > + *cnt = host_err;
> > +
> > + return 0;
> > +}
> > +
> > +static int
> > +nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
> > + u32 *stable_how, unsigned int nvecs, unsigned long *cnt,
> > + struct kiocb *kiocb, struct nfsd_write_dio *write_dio)
> > +{
> > + struct file *file = nf->nf_file;
> > + bool iter_is_dio_aligned[3];
> > + struct iov_iter iter_stack[3];
> > + struct iov_iter *iter = iter_stack;
> > + unsigned int n_iters = 0;
> > + unsigned long in_count = *cnt;
> > + loff_t in_offset = kiocb->ki_pos;
> > + ssize_t host_err;
> > +
> > + n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned,
> > + rqstp->rq_bvec, nvecs, *cnt, write_dio, nf);
> > + if (unlikely(!n_iters))
> > + return nfsd_buffered_write(rqstp, file, nvecs, cnt, kiocb);
> > +
> > + trace_nfsd_write_direct(rqstp, fhp, in_offset, in_count);
> > +
> > + /*
> > + * Any buffered IO issued here will be misaligned, use
> > + * sync IO to ensure it has completed before returning.
> > + * Also update @stable_how to avoid need for COMMIT.
> > + */
> > + kiocb->ki_flags |= (IOCB_DSYNC|IOCB_SYNC);
> > + *stable_how = NFS_FILE_SYNC;
> > +
> > + *cnt = 0;
> > + for (int i = 0; i < n_iters; i++) {
> > + if (iter_is_dio_aligned[i])
> > + kiocb->ki_flags |= IOCB_DIRECT;
> > + else
> > + kiocb->ki_flags &= ~IOCB_DIRECT;
> > +
> > + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
> > + if (host_err < 0) {
> > + /*
> > + * VFS will return -ENOTBLK if DIO WRITE fails to
> > + * invalidate the page cache. Retry using buffered IO.
> > + */
>
> I'm debating with myself whether, now that NFSD is using DIO, nfserrno
> should get a mapping from ENOTBLK to nfserr_serverfault or nfserr_io,
> simply as a defensive measure.
>
> I kind of like the idea that we get a warning in nfserrno: "hey, I
> didn't expect ENOTBLK here" so I'm leaning towards leaving it as is
> for now.
While ENOTBLK isn't expected, it is a real possibility from the MM
subsystem's inability to invalidate the page cache ("for reasons").
So it isn't that NFSD would've done anything wrong; it just needs to
deal with the possibility (as should any other DIO write in the kernel).
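The retry shape under discussion reduces to: attempt the write with
IOCB_DIRECT, and on -ENOTBLK reissue the whole range without it. A
minimal userspace sketch (not NFSD code; `do_write()` is a stand-in
for `vfs_iocb_iter_write()`, with `cache_busy` simulating a failed
page-cache invalidation):

```c
#include <assert.h>
#include <errno.h>
#include <stdbool.h>

/*
 * Stand-in for vfs_iocb_iter_write(): a direct write fails with
 * -ENOTBLK when the (simulated) page cache cannot be invalidated;
 * a buffered write always completes the full range.
 */
static int do_write(bool direct, bool cache_busy, long *written, long count)
{
	if (direct && cache_busy)
		return -ENOTBLK;	/* page-cache invalidation failed */
	*written = count;		/* pretend the full range was written */
	return 0;
}

/* Try DIO first; on -ENOTBLK, retry the same range as buffered IO. */
static int write_with_dio_fallback(bool cache_busy, long *written, long count)
{
	int err = do_write(true, cache_busy, written, count);

	if (err == -ENOTBLK)
		err = do_write(false, cache_busy, written, count);
	return err;
}
```

In the real patch the fallback additionally clears IOCB_DIRECT and
restores `*cnt` and `kiocb->ki_pos` before reissuing, since the failed
attempt may have consumed them.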
>
> > + if (unlikely(host_err == -ENOTBLK)) {
> > + kiocb->ki_flags &= ~IOCB_DIRECT;
> > + *cnt = in_count;
> > + kiocb->ki_pos = in_offset;
> > + return nfsd_buffered_write(rqstp, file,
> > + nvecs, cnt, kiocb);
> > + } else if (unlikely(host_err == -EINVAL)) {
> > + struct inode *inode = d_inode(fhp->fh_dentry);
> > +
> > + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
> > + inode->i_sb->s_id, inode->i_ino);
> > + host_err = -ESERVERFAULT;
> > + }
> > + return host_err;
> > + }
> > + *cnt += host_err;
> > + if (host_err < iter[i].count) /* partial write? */
> > + break;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static noinline_for_stack int
> > +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > + struct nfsd_file *nf, u32 *stable_how, unsigned int nvecs,
> > + unsigned long *cnt, struct kiocb *kiocb)
> > +{
> > + struct nfsd_write_dio write_dio;
> > +
> > + /*
> > + * Check if IOCB_DONTCACHE can be used when issuing buffered IO;
> > + * if so, set it to preserve intent of NFSD_IO_DIRECT (it will
> > + * be ignored for any DIO issued here).
> > + */
> > + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
> > + kiocb->ki_flags |= IOCB_DONTCACHE;
> > +
> > + if (nfsd_is_write_dio_possible(kiocb->ki_pos, *cnt, nf, &write_dio))
> > + return nfsd_issue_write_dio(rqstp, fhp, nf, stable_how, nvecs,
> > + cnt, kiocb, &write_dio);
> > +
> > + return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb);
> > +}
> > +
> > /**
> > * nfsd_vfs_write - write data to an already-open file
> > * @rqstp: RPC execution context
> > @@ -1281,7 +1485,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > u32 stable = *stable_how;
> > struct kiocb kiocb;
> > struct svc_export *exp;
> > - struct iov_iter iter;
> > errseq_t since;
> > __be32 nfserr;
> > int host_err;
> > @@ -1318,25 +1521,30 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > kiocb.ki_flags |= IOCB_DSYNC;
> >
> > nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
> > - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
> > +
> > since = READ_ONCE(file->f_wb_err);
> > if (verf)
> > nfsd_copy_write_verifier(verf, nn);
> >
> > switch (nfsd_io_cache_write) {
> > - case NFSD_IO_BUFFERED:
> > + case NFSD_IO_DIRECT:
> > + host_err = nfsd_direct_write(rqstp, fhp, nf, stable_how,
> > + nvecs, cnt, &kiocb);
> > + stable = *stable_how;
> > break;
> > case NFSD_IO_DONTCACHE:
> > if (file->f_op->fop_flags & FOP_DONTCACHE)
> > kiocb.ki_flags |= IOCB_DONTCACHE;
> > + fallthrough; /* must call nfsd_buffered_write */
>
> I'm not clear what purpose this comment serves. The fallthrough
> already states what's going to happen next.
I think it adds value. That the fallthrough happens to have the
side-effect of calling nfsd_buffered_write()... sure, I can see why
you might think it a needless comment. But the name of the function
isn't the interesting bit; that DONTCACHE needs to be submitted as if
it were buffered IO is.
But you're welcome to remove the comment. Or did you need me to send
a v4 to do so?
Thanks for the review!
Mike
^ permalink raw reply [flat|nested] 23+ messages in thread

* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-17 21:03 ` Mike Snitzer
@ 2025-10-17 21:54 ` Chuck Lever
2025-10-17 22:49 ` Mike Snitzer
0 siblings, 1 reply; 23+ messages in thread
From: Chuck Lever @ 2025-10-17 21:54 UTC (permalink / raw)
To: Mike Snitzer
Cc: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On 10/17/25 5:03 PM, Mike Snitzer wrote:
> On Fri, Oct 17, 2025 at 10:13:14AM -0400, Chuck Lever wrote:
>> On 10/15/25 6:13 PM, Mike Snitzer wrote:
>>> + *cnt = 0;
>>> + for (int i = 0; i < n_iters; i++) {
>>> + if (iter_is_dio_aligned[i])
>>> + kiocb->ki_flags |= IOCB_DIRECT;
>>> + else
>>> + kiocb->ki_flags &= ~IOCB_DIRECT;
>>> +
>>> + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
>>> + if (host_err < 0) {
>>> + /*
>>> + * VFS will return -ENOTBLK if DIO WRITE fails to
>>> + * invalidate the page cache. Retry using buffered IO.
>>> + */
>>
>> I'm debating with myself whether, now that NFSD is using DIO, nfserrno
>> should get a mapping from ENOTBLK to nfserr_serverfault or nfserr_io,
>> simply as a defensive measure.
>>
>> I kind of like the idea that we get a warning in nfserrno: "hey, I
>> didn't expect ENOTBLK here" so I'm leaning towards leaving it as is
>> for now.
>
> While ENOTBLK isn't expected, it is a real possibility from the MM
> subsystem's inability invalidate the page cache ("for reasons").
>
> So it isn't that NFSD would've done anything wrong.. it just needs to
> deal with the possibility (as should any other DIO write in kernel).
Well my point is that if the MM/VFS can return this on a buffered write,
should NFSD be prepared to deal with it in other places besides this
path? I think it's handling ENOTBLK correctly here. Just wondering about
elsewhere.
This is something we can set aside and worry about after merge. It
actually has more to do with the existing parts of NFSD, not the new
direct write code path.
I'll repost the "stable_how" patch and this one together in a new
series, for Christoph, and pull that series into nfsd-testing.
--
Chuck Lever
^ permalink raw reply [flat|nested] 23+ messages in thread

* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-17 21:54 ` Chuck Lever
@ 2025-10-17 22:49 ` Mike Snitzer
0 siblings, 0 replies; 23+ messages in thread
From: Mike Snitzer @ 2025-10-17 22:49 UTC (permalink / raw)
To: Chuck Lever
Cc: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Fri, Oct 17, 2025 at 05:54:00PM -0400, Chuck Lever wrote:
> On 10/17/25 5:03 PM, Mike Snitzer wrote:
> > On Fri, Oct 17, 2025 at 10:13:14AM -0400, Chuck Lever wrote:
> >> On 10/15/25 6:13 PM, Mike Snitzer wrote:
>
> >>> + *cnt = 0;
> >>> + for (int i = 0; i < n_iters; i++) {
> >>> + if (iter_is_dio_aligned[i])
> >>> + kiocb->ki_flags |= IOCB_DIRECT;
> >>> + else
> >>> + kiocb->ki_flags &= ~IOCB_DIRECT;
> >>> +
> >>> + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
> >>> + if (host_err < 0) {
> >>> + /*
> >>> + * VFS will return -ENOTBLK if DIO WRITE fails to
> >>> + * invalidate the page cache. Retry using buffered IO.
> >>> + */
> >>
> >> I'm debating with myself whether, now that NFSD is using DIO, nfserrno
> >> should get a mapping from ENOTBLK to nfserr_serverfault or nfserr_io,
> >> simply as a defensive measure.
> >>
> >> I kind of like the idea that we get a warning in nfserrno: "hey, I
> >> didn't expect ENOTBLK here" so I'm leaning towards leaving it as is
> >> for now.
> >
> > While ENOTBLK isn't expected, it is a real possibility from the MM
> > subsystem's inability to invalidate the page cache ("for reasons").
> >
> > So it isn't that NFSD would've done anything wrong; it just needs to
> > deal with the possibility (as should any other DIO write in the kernel).
>
> Well my point is that if the MM/VFS can return this on a buffered write,
> should NFSD be prepared to deal with it in other places besides this
> path? I think it's handling ENOTBLK correctly here. Just wondering about
> elsewhere.
>
> This is something we can set aside and worry about after merge. It
> actually has more to do with the existing parts of NFSD, not the new
> direct write code path.
I'm only aware of MM returning ENOTBLK in response to a DIO write that
it couldn't invalidate the page cache for.
So existing NFSD code shouldn't be exposed AFAIK. But yeah, worth
another look.
> I'll repost the "stable_how" patch and this one together in a new
> series, for Christoph, and pull that series into nfsd-testing.
Awesome, thanks.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
2025-10-17 14:13 ` Chuck Lever
2025-10-17 21:03 ` Mike Snitzer
@ 2025-10-17 23:23 ` NeilBrown
1 sibling, 0 replies; 23+ messages in thread
From: NeilBrown @ 2025-10-17 23:23 UTC (permalink / raw)
To: Chuck Lever
Cc: Mike Snitzer, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Sat, 18 Oct 2025, Chuck Lever wrote:
> On 10/15/25 6:13 PM, Mike Snitzer wrote:
> > If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
> > middle and end as needed. The large middle extent is DIO-aligned and
> > the start and/or end are misaligned. Synchronous buffered IO (with
> > preference towards using DONTCACHE) is used for the misaligned extents
> > and O_DIRECT is used for the middle DIO-aligned extent.
> >
> > nfsd_issue_write_dio() promotes @stable_how to NFS_FILE_SYNC, which
> > allows the client to drop its dirty data and avoid needing an extra
> > COMMIT operation.
> >
> > If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to
> > invalidate the page cache on behalf of the DIO WRITE, then
> > nfsd_issue_write_dio() will fall back to using buffered IO.
> >
> > These changes served as the original starting point for the NFS
> > client's misaligned O_DIRECT support that landed with commit
> > c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and
> > WRITE"). But NFSD's support is simpler because it currently doesn't
> > use AIO completion.
> >
> > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
>
> Handful of nits, and one interesting question. Please send a v4 to
> address the two comment issues below.
>
> Question: How is NFSD supposed to act if the administrator sets the
> async export option and NFSD_IO_DIRECT at the same time?
>
> - Direct I/O dispatched but not waited for ?
You have to wait for direct I/O. The caller owns the memory and has to
wait for the IO path to finish using it.
>
> - Fall back to DONTCACHE ?
>
> - Ignore the async setting (that's what it appears to do now) ?
I think this is the only sensible approach. If we one day have a more
formal mechanism for enabling direct IO, then we can discuss and
document the interplay with other export options, but while it is a
debugfs setting, I don't think there is any need for subtlety.
Thanks,
NeilBrown
>
> - Something else ?
>
> IMO async + DIRECT seems like a disaster waiting to happen. It should
> be pretty easy to push the NFS server over a cliff because there is no
> connection between clients submitting writes and data getting persisted.
> Dirty data will just pile up in the server's memory because it can't get
> flushed fast enough.
>
> Synchronous direct I/O has a very strong connection between how fast
> persistent storage can go and how fast clients can push more writes.
>
>
> > ---
> > fs/nfsd/debugfs.c | 1 +
> > fs/nfsd/trace.h | 1 +
> > fs/nfsd/vfs.c | 218 ++++++++++++++++++++++++++++++++++++++++++++--
> > 3 files changed, 215 insertions(+), 5 deletions(-)
> >
> > Changes since v2:
> > - rebased ontop of "[PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients"
> > - set both IOCB_DSYNC and IOCB_SYNC (rather than just IOCB_SYNC) in nfsd_issue_write_dio()
> > - update nfsd_issue_write_dio() to promote @stable_how to NFS_FILE_SYNC
> > - push call to trace_nfsd_write_direct down from nfsd_direct_write to nfsd_issue_write_dio
> > - fix comment block style to have naked '/*' on first line
> >
> > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > index 00eb1ecef6ac..7f44689e0a53 100644
> > --- a/fs/nfsd/debugfs.c
> > +++ b/fs/nfsd/debugfs.c
> > @@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
> > switch (val) {
> > case NFSD_IO_BUFFERED:
> > case NFSD_IO_DONTCACHE:
> > + case NFSD_IO_DIRECT:
> > nfsd_io_cache_write = val;
> > break;
> > default:
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index bfd41236aff2..ad74439d0105 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
> > DEFINE_NFSD_IO_EVENT(read_done);
> > DEFINE_NFSD_IO_EVENT(write_start);
> > DEFINE_NFSD_IO_EVENT(write_opened);
> > +DEFINE_NFSD_IO_EVENT(write_direct);
> > DEFINE_NFSD_IO_EVENT(write_io_done);
> > DEFINE_NFSD_IO_EVENT(write_done);
> > DEFINE_NFSD_IO_EVENT(commit_start);
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a3dee33a7233..ba7cb698ac68 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1253,6 +1253,210 @@ static int wait_for_concurrent_writes(struct file *file)
> > return err;
> > }
> >
> > +struct nfsd_write_dio {
> > + ssize_t start_len; /* Length for misaligned first extent */
> > + ssize_t middle_len; /* Length for DIO-aligned middle extent */
> > + ssize_t end_len; /* Length for misaligned last extent */
> > +};
> > +
> > +static bool
> > +nfsd_is_write_dio_possible(loff_t offset, unsigned long len,
> > + struct nfsd_file *nf, struct nfsd_write_dio *write_dio)
> > +{
> > + const u32 dio_blocksize = nf->nf_dio_offset_align;
> > + loff_t start_end, orig_end, middle_end;
> > +
> > + if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize))
> > + return false;
> > + if (unlikely(dio_blocksize > PAGE_SIZE))
> > + return false;
> > + if (unlikely(len < dio_blocksize))
> > + return false;
> > +
> > + start_end = round_up(offset, dio_blocksize);
> > + orig_end = offset + len;
> > + middle_end = round_down(orig_end, dio_blocksize);
> > +
> > + write_dio->start_len = start_end - offset;
> > + write_dio->middle_len = middle_end - start_end;
> > + write_dio->end_len = orig_end - middle_end;
> > +
> > + return true;
> > +}
> > +
> > +static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i,
> > + unsigned int addr_mask, unsigned int len_mask)
>
> IIRC you had mentioned that there is possibly a generic utility
> function that does this that we could adopt here. If we are going
> to keep this one in spite of having a generic, it needs a comment
> explaining why we're not using the generic version.
>
>
> > +{
> > + const struct bio_vec *bvec = i->bvec;
> > + size_t skip = i->iov_offset;
> > + size_t size = i->count;
> > +
> > + if (size & len_mask)
> > + return false;
> > + do {
> > + size_t len = bvec->bv_len;
> > +
> > + if (len > size)
> > + len = size;
> > + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
> > + return false;
> > + bvec++;
> > + size -= len;
> > + skip = 0;
> > + } while (size);
> > +
> > + return true;
> > +}
> > +
> > +/*
> > + * Setup as many as 3 iov_iter based on extents described by @write_dio.
> > + * Returns the number of iov_iter that were setup.
> > + */
> > +static int
> > +nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
> > + struct bio_vec *rq_bvec, unsigned int nvecs,
> > + unsigned long cnt, struct nfsd_write_dio *write_dio,
> > + struct nfsd_file *nf)
> > +{
> > + int n_iters = 0;
> > + struct iov_iter *iters = *iterp;
> > +
> > + /* Setup misaligned start? */
> > + if (write_dio->start_len) {
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + iters[n_iters].count = write_dio->start_len;
> > + iter_is_dio_aligned[n_iters] = false;
> > + ++n_iters;
> > + }
> > +
> > + /* Setup DIO-aligned middle */
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + if (write_dio->start_len)
> > + iov_iter_advance(&iters[n_iters], write_dio->start_len);
> > + iters[n_iters].count -= write_dio->end_len;
> > + iter_is_dio_aligned[n_iters] =
> > + nfsd_iov_iter_aligned_bvec(&iters[n_iters],
> > + nf->nf_dio_mem_align-1, nf->nf_dio_offset_align-1);
> > + if (unlikely(!iter_is_dio_aligned[n_iters]))
> > + return 0; /* no DIO-aligned IO possible */
> > + ++n_iters;
> > +
> > + /* Setup misaligned end? */
> > + if (write_dio->end_len) {
> > + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
> > + iov_iter_advance(&iters[n_iters],
> > + write_dio->start_len + write_dio->middle_len);
> > + iter_is_dio_aligned[n_iters] = false;
> > + ++n_iters;
> > + }
> > +
> > + return n_iters;
> > +}
> > +
> > +static int
> > +nfsd_buffered_write(struct svc_rqst *rqstp, struct file *file,
> > + unsigned int nvecs, unsigned long *cnt,
> > + struct kiocb *kiocb)
> > +{
> > + struct iov_iter iter;
> > + int host_err;
> > +
> > + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
> > + host_err = vfs_iocb_iter_write(file, kiocb, &iter);
> > + if (host_err < 0)
> > + return host_err;
> > + *cnt = host_err;
> > +
> > + return 0;
> > +}
> > +
> > +static int
> > +nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
> > + u32 *stable_how, unsigned int nvecs, unsigned long *cnt,
> > + struct kiocb *kiocb, struct nfsd_write_dio *write_dio)
> > +{
> > + struct file *file = nf->nf_file;
> > + bool iter_is_dio_aligned[3];
> > + struct iov_iter iter_stack[3];
> > + struct iov_iter *iter = iter_stack;
> > + unsigned int n_iters = 0;
> > + unsigned long in_count = *cnt;
> > + loff_t in_offset = kiocb->ki_pos;
> > + ssize_t host_err;
> > +
> > + n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned,
> > + rqstp->rq_bvec, nvecs, *cnt, write_dio, nf);
> > + if (unlikely(!n_iters))
> > + return nfsd_buffered_write(rqstp, file, nvecs, cnt, kiocb);
> > +
> > + trace_nfsd_write_direct(rqstp, fhp, in_offset, in_count);
> > +
> > + /*
> > + * Any buffered IO issued here will be misaligned; use
> > + * sync IO to ensure it has completed before returning.
> > + * Also update @stable_how to avoid the need for a COMMIT.
> > + */
> > + kiocb->ki_flags |= (IOCB_DSYNC|IOCB_SYNC);
> > + *stable_how = NFS_FILE_SYNC;
> > +
> > + *cnt = 0;
> > + for (int i = 0; i < n_iters; i++) {
> > + if (iter_is_dio_aligned[i])
> > + kiocb->ki_flags |= IOCB_DIRECT;
> > + else
> > + kiocb->ki_flags &= ~IOCB_DIRECT;
> > +
> > + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]);
> > + if (host_err < 0) {
> > + /*
> > + * VFS will return -ENOTBLK if DIO WRITE fails to
> > + * invalidate the page cache. Retry using buffered IO.
> > + */
>
> I'm debating with myself whether, now that NFSD is using DIO, nfserrno
> should get a mapping from ENOTBLK to nfserr_serverfault or nfserr_io,
> simply as a defensive measure.
>
> I kind of like the idea that we get a warning in nfserrno: "hey, I
> didn't expect ENOTBLK here" so I'm leaning towards leaving it as is
> for now.
>
>
> > + if (unlikely(host_err == -ENOTBLK)) {
> > + kiocb->ki_flags &= ~IOCB_DIRECT;
> > + *cnt = in_count;
> > + kiocb->ki_pos = in_offset;
> > + return nfsd_buffered_write(rqstp, file,
> > + nvecs, cnt, kiocb);
> > + } else if (unlikely(host_err == -EINVAL)) {
> > + struct inode *inode = d_inode(fhp->fh_dentry);
> > +
> > + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
> > + inode->i_sb->s_id, inode->i_ino);
> > + host_err = -ESERVERFAULT;
> > + }
> > + return host_err;
> > + }
> > + *cnt += host_err;
> > + if (host_err < iter[i].count) /* partial write? */
> > + break;
> > + }
> > +
> > + return 0;
> > +}
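
[Editor's note: the loop above flips `IOCB_DIRECT` per segment and, if DIO fails with -ENOTBLK, restarts the whole request as buffered I/O. A user-space sketch of that control flow follows; `fake_write()`, `issue_segments()`, and all other names here are illustrative stand-ins, not kernel APIs.]

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>

enum io_mode { IO_DIRECT, IO_BUFFERED };

/* Fake backend: direct I/O fails with -ENOTBLK unless @direct_ok. */
static long fake_write(enum io_mode mode, size_t len, int direct_ok)
{
	if (mode == IO_DIRECT && !direct_ok)
		return -ENOTBLK;
	return (long)len;
}

static long issue_segments(const size_t *lens, const int *is_aligned,
			   int n, int direct_ok, size_t *written)
{
	*written = 0;
	for (int i = 0; i < n; i++) {
		enum io_mode mode = is_aligned[i] ? IO_DIRECT : IO_BUFFERED;
		long ret = fake_write(mode, lens[i], direct_ok);

		if (ret == -ENOTBLK) {
			/* Page-cache invalidation failed: redo the whole
			 * request with buffered I/O, as the patch does by
			 * resetting *cnt and kiocb->ki_pos. */
			size_t total = 0;

			for (int j = 0; j < n; j++)
				total += lens[j];
			*written = (size_t)fake_write(IO_BUFFERED, total,
						      direct_ok);
			return 0;
		}
		if (ret < 0)
			return ret;
		*written += (size_t)ret;
		if ((size_t)ret < lens[i])	/* partial write? */
			break;
	}
	return 0;
}
```

The key property is that a short write of one segment stops the loop with a partial count rather than issuing the later segments at the wrong offset.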
> > +
> > +static noinline_for_stack int
> > +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > + struct nfsd_file *nf, u32 *stable_how, unsigned int nvecs,
> > + unsigned long *cnt, struct kiocb *kiocb)
> > +{
> > + struct nfsd_write_dio write_dio;
> > +
> > + /*
> > + * Check if IOCB_DONTCACHE can be used when issuing buffered IO;
> > + * if so, set it to preserve intent of NFSD_IO_DIRECT (it will
> > + * be ignored for any DIO issued here).
> > + */
> > + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
> > + kiocb->ki_flags |= IOCB_DONTCACHE;
> > +
> > + if (nfsd_is_write_dio_possible(kiocb->ki_pos, *cnt, nf, &write_dio))
> > + return nfsd_issue_write_dio(rqstp, fhp, nf, stable_how, nvecs,
> > + cnt, kiocb, &write_dio);
> > +
> > + return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb);
> > +}
> > +
> > /**
> > * nfsd_vfs_write - write data to an already-open file
> > * @rqstp: RPC execution context
> > @@ -1281,7 +1485,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > u32 stable = *stable_how;
> > struct kiocb kiocb;
> > struct svc_export *exp;
> > - struct iov_iter iter;
> > errseq_t since;
> > __be32 nfserr;
> > int host_err;
> > @@ -1318,25 +1521,30 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > kiocb.ki_flags |= IOCB_DSYNC;
> >
> > nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
> > - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
> > +
> > since = READ_ONCE(file->f_wb_err);
> > if (verf)
> > nfsd_copy_write_verifier(verf, nn);
> >
> > switch (nfsd_io_cache_write) {
> > - case NFSD_IO_BUFFERED:
> > + case NFSD_IO_DIRECT:
> > + host_err = nfsd_direct_write(rqstp, fhp, nf, stable_how,
> > + nvecs, cnt, &kiocb);
> > + stable = *stable_how;
> > break;
> > case NFSD_IO_DONTCACHE:
> > if (file->f_op->fop_flags & FOP_DONTCACHE)
> > kiocb.ki_flags |= IOCB_DONTCACHE;
> > + fallthrough; /* must call nfsd_buffered_write */
>
> I'm not clear what purpose this comment serves. The fallthrough
> already states what's going to happen next.
>
>
> > + case NFSD_IO_BUFFERED:
> > + host_err = nfsd_buffered_write(rqstp, file,
> > + nvecs, cnt, &kiocb);
> > break;
> > }
> > - host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
> > if (host_err < 0) {
> > commit_reset_write_verifier(nn, rqstp, host_err);
> > goto out_nfserr;
> > }
> > - *cnt = host_err;
> > nfsd_stats_io_write_add(nn, exp, *cnt);
> > fsnotify_modify(file);
> > host_err = filemap_check_wb_err(file->f_mapping, since);
>
>
> --
> Chuck Lever
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-13 19:01 [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Chuck Lever
` (3 preceding siblings ...)
2025-10-15 22:13 ` [PATCH v3] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE Mike Snitzer
@ 2025-10-17 4:08 ` Christoph Hellwig
2025-10-17 13:27 ` Chuck Lever
4 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2025-10-17 4:08 UTC (permalink / raw)
To: Chuck Lever
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Mike Snitzer, Chuck Lever
On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> not need a subsequent COMMIT operation, saving a round trip and
> allowing the client to dispense with cached dirty data as soon as
> it receives the server's WRITE response.
What's the subsequent patch doing? Having the actual behavior change
in the same series would really help to understand what is going on
here.
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 4:08 ` [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients Christoph Hellwig
@ 2025-10-17 13:27 ` Chuck Lever
2025-10-20 6:59 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Chuck Lever @ 2025-10-17 13:27 UTC (permalink / raw)
To: Christoph Hellwig
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Mike Snitzer, Chuck Lever
On 10/17/25 12:08 AM, Christoph Hellwig wrote:
> On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>>
>> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
>> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
>> not need a subsequent COMMIT operation, saving a round trip and
>> allowing the client to dispense with cached dirty data as soon as
>> it receives the server's WRITE response.
>
> What's the subsequent patch doing? Having the actual behavior change
> in the same series would really help to understand what is going on
> here.
The subsequent patch is:
https://lore.kernel.org/linux-nfs/aPAci7O_XK1ljaum@kernel.org/
We haven't put the two together yet.
--
Chuck Lever
* Re: [PATCH v1] NFSD: Enable return of an updated stable_how to NFS clients
2025-10-17 13:27 ` Chuck Lever
@ 2025-10-20 6:59 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2025-10-20 6:59 UTC (permalink / raw)
To: Chuck Lever
Cc: Christoph Hellwig, NeilBrown, Jeff Layton, Olga Kornievskaia,
Dai Ngo, Tom Talpey, linux-nfs, Mike Snitzer, Chuck Lever
On Fri, Oct 17, 2025 at 09:27:03AM -0400, Chuck Lever wrote:
> On 10/17/25 12:08 AM, Christoph Hellwig wrote:
> > On Mon, Oct 13, 2025 at 03:01:13PM -0400, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >>
> >> In a subsequent patch, nfsd_vfs_write() will promote an UNSTABLE
> >> WRITE to be a FILE_SYNC WRITE. This indicates that the client does
> >> not need a subsequent COMMIT operation, saving a round trip and
> >> allowing the client to dispense with cached dirty data as soon as
> >> it receives the server's WRITE response.
> >
> > What's the subsequent patch doing? Having the actual behavior change
> > in the same series would really help to understand what is going on
> > here.
> The subsequent patch is:
>
> https://lore.kernel.org/linux-nfs/aPAci7O_XK1ljaum@kernel.org/
>
> We haven't put the two together yet.
Ah, that was a bit confused. Not helped by reading it at the airport
after an overnight flight :)