* [PATCH v3 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
2025-09-22 14:11 [PATCH v3 0/3] NFSD direct I/O read Chuck Lever
@ 2025-09-22 14:11 ` Chuck Lever
2025-09-24 14:00 ` Chuck Lever
2025-09-22 14:11 ` [PATCH v3 2/3] NFSD: pass nfsd_file to nfsd_iter_read() Chuck Lever
2025-09-22 14:11 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ Chuck Lever
2 siblings, 1 reply; 19+ messages in thread
From: Chuck Lever @ 2025-09-22 14:11 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Mike Snitzer
From: Mike Snitzer <snitzer@kernel.org>
Use STATX_DIOALIGN and STATX_DIO_READ_ALIGN to get DIO alignment
attributes from the underlying filesystem and store them in the
associated nfsd_file. This is done when the nfsd_file is first
opened for each regular file.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/filecache.c | 34 ++++++++++++++++++++++++++++++++++
fs/nfsd/filecache.h | 4 ++++
fs/nfsd/nfsfh.c | 4 ++++
fs/nfsd/trace.h | 27 +++++++++++++++++++++++++++
include/trace/misc/fs.h | 22 ++++++++++++++++++++++
5 files changed, 91 insertions(+)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 75bc48031c07..78cca0d751ac 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
refcount_set(&nf->nf_ref, 1);
nf->nf_may = need;
nf->nf_mark = NULL;
+ nf->nf_dio_mem_align = 0;
+ nf->nf_dio_offset_align = 0;
+ nf->nf_dio_read_offset_align = 0;
return nf;
}
@@ -1048,6 +1051,35 @@ nfsd_file_is_cached(struct inode *inode)
return ret;
}
+static __be32
+nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf)
+{
+ struct inode *inode = file_inode(nf->nf_file);
+ struct kstat stat;
+ __be32 status;
+
+ /* Currently only need to get DIO alignment info for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return nfs_ok;
+
+ status = fh_getattr(fhp, &stat);
+ if (status != nfs_ok)
+ return status;
+
+ trace_nfsd_file_get_dio_attrs(inode, &stat);
+
+ if (stat.result_mask & STATX_DIOALIGN) {
+ nf->nf_dio_mem_align = stat.dio_mem_align;
+ nf->nf_dio_offset_align = stat.dio_offset_align;
+ }
+ if (stat.result_mask & STATX_DIO_READ_ALIGN)
+ nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
+ else
+ nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
+
+ return nfs_ok;
+}
+
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
struct svc_cred *cred,
@@ -1166,6 +1198,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
}
status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
+ if (status == nfs_ok)
+ status = nfsd_file_get_dio_attrs(fhp, nf);
}
} else
status = nfserr_jukebox;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 24ddf60e8434..e3d6ca2b6030 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -54,6 +54,10 @@ struct nfsd_file {
struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
+
+ u32 nf_dio_mem_align;
+ u32 nf_dio_offset_align;
+ u32 nf_dio_read_offset_align;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3edccc38db42..3eb724ec9566 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -696,8 +696,12 @@ __be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat)
.mnt = fhp->fh_export->ex_path.mnt,
.dentry = fhp->fh_dentry,
};
+ struct inode *inode = d_inode(p.dentry);
u32 request_mask = STATX_BASIC_STATS;
+ if (S_ISREG(inode->i_mode))
+ request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN);
+
if (fhp->fh_maxsize == NFS4_FHSIZE)
request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a664fdf1161e..6e2c8e2aab10 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc,
)
);
+TRACE_EVENT(nfsd_file_get_dio_attrs,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct kstat *stat
+ ),
+ TP_ARGS(inode, stat),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ __field(unsigned long, mask)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(u32, read_offset_align)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->mask = stat->result_mask;
+ __entry->mem_align = stat->dio_mem_align;
+ __entry->offset_align = stat->dio_offset_align;
+ __entry->read_offset_align = stat->dio_read_offset_align;
+ ),
+ TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u",
+ __entry->inode, show_statx_mask(__entry->mask),
+ __entry->mem_align, __entry->offset_align,
+ __entry->read_offset_align
+ )
+);
+
TRACE_EVENT(nfsd_file_acquire,
TP_PROTO(
const struct svc_rqst *rqstp,
diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h
index 0406ebe2a80a..7ead1c61f0cb 100644
--- a/include/trace/misc/fs.h
+++ b/include/trace/misc/fs.h
@@ -141,3 +141,25 @@
{ ATTR_TIMES_SET, "TIMES_SET" }, \
{ ATTR_TOUCH, "TOUCH"}, \
{ ATTR_DELEG, "DELEG"})
+
+#define show_statx_mask(flags) \
+ __print_flags(flags, "|", \
+ { STATX_TYPE, "TYPE" }, \
+ { STATX_MODE, "MODE" }, \
+ { STATX_NLINK, "NLINK" }, \
+ { STATX_UID, "UID" }, \
+ { STATX_GID, "GID" }, \
+ { STATX_ATIME, "ATIME" }, \
+ { STATX_MTIME, "MTIME" }, \
+ { STATX_CTIME, "CTIME" }, \
+ { STATX_INO, "INO" }, \
+ { STATX_SIZE, "SIZE" }, \
+ { STATX_BLOCKS, "BLOCKS" }, \
+ { STATX_BASIC_STATS, "BASIC_STATS" }, \
+ { STATX_BTIME, "BTIME" }, \
+ { STATX_MNT_ID, "MNT_ID" }, \
+ { STATX_DIOALIGN, "DIOALIGN" }, \
+ { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \
+ { STATX_SUBVOL, "SUBVOL" }, \
+ { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \
+ { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" })
--
2.51.0
* Re: [PATCH v3 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
2025-09-22 14:11 ` [PATCH v3 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Chuck Lever
@ 2025-09-24 14:00 ` Chuck Lever
0 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2025-09-24 14:00 UTC (permalink / raw)
To: Anna Schumaker
Cc: linux-nfs, Mike Snitzer, NeilBrown, Jeff Layton,
Olga Kornievskaia, Dai Ngo, Tom Talpey
On 9/22/25 7:11 AM, Chuck Lever wrote:
> From: Mike Snitzer <snitzer@kernel.org>
>
> Use STATX_DIOALIGN and STATX_DIO_READ_ALIGN to get DIO alignment
> attributes from the underlying filesystem and store them in the
> associated nfsd_file. This is done when the nfsd_file is first
> opened for each regular file.
>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> Reviewed-by: NeilBrown <neil@brown.name>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
v3 of this patch is the one that should be included for the client
series you plan to merge.
When you apply it, please remove "Signed-off-by: Chuck ..." and
replace it with:
Acked-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> fs/nfsd/filecache.c | 34 ++++++++++++++++++++++++++++++++++
> fs/nfsd/filecache.h | 4 ++++
> fs/nfsd/nfsfh.c | 4 ++++
> fs/nfsd/trace.h | 27 +++++++++++++++++++++++++++
> include/trace/misc/fs.h | 22 ++++++++++++++++++++++
> 5 files changed, 91 insertions(+)
>
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 75bc48031c07..78cca0d751ac 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
> refcount_set(&nf->nf_ref, 1);
> nf->nf_may = need;
> nf->nf_mark = NULL;
> + nf->nf_dio_mem_align = 0;
> + nf->nf_dio_offset_align = 0;
> + nf->nf_dio_read_offset_align = 0;
> return nf;
> }
>
> @@ -1048,6 +1051,35 @@ nfsd_file_is_cached(struct inode *inode)
> return ret;
> }
>
> +static __be32
> +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf)
> +{
> + struct inode *inode = file_inode(nf->nf_file);
> + struct kstat stat;
> + __be32 status;
> +
> + /* Currently only need to get DIO alignment info for regular files */
> + if (!S_ISREG(inode->i_mode))
> + return nfs_ok;
> +
> + status = fh_getattr(fhp, &stat);
> + if (status != nfs_ok)
> + return status;
> +
> + trace_nfsd_file_get_dio_attrs(inode, &stat);
> +
> + if (stat.result_mask & STATX_DIOALIGN) {
> + nf->nf_dio_mem_align = stat.dio_mem_align;
> + nf->nf_dio_offset_align = stat.dio_offset_align;
> + }
> + if (stat.result_mask & STATX_DIO_READ_ALIGN)
> + nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
> + else
> + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
> +
> + return nfs_ok;
> +}
> +
> static __be32
> nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
> struct svc_cred *cred,
> @@ -1166,6 +1198,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
> }
> status = nfserrno(ret);
> trace_nfsd_file_open(nf, status);
> + if (status == nfs_ok)
> + status = nfsd_file_get_dio_attrs(fhp, nf);
> }
> } else
> status = nfserr_jukebox;
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index 24ddf60e8434..e3d6ca2b6030 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -54,6 +54,10 @@ struct nfsd_file {
> struct list_head nf_gc;
> struct rcu_head nf_rcu;
> ktime_t nf_birthtime;
> +
> + u32 nf_dio_mem_align;
> + u32 nf_dio_offset_align;
> + u32 nf_dio_read_offset_align;
> };
>
> int nfsd_file_cache_init(void);
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index 3edccc38db42..3eb724ec9566 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -696,8 +696,12 @@ __be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat)
> .mnt = fhp->fh_export->ex_path.mnt,
> .dentry = fhp->fh_dentry,
> };
> + struct inode *inode = d_inode(p.dentry);
> u32 request_mask = STATX_BASIC_STATS;
>
> + if (S_ISREG(inode->i_mode))
> + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN);
> +
> if (fhp->fh_maxsize == NFS4_FHSIZE)
> request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
>
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index a664fdf1161e..6e2c8e2aab10 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc,
> )
> );
>
> +TRACE_EVENT(nfsd_file_get_dio_attrs,
> + TP_PROTO(
> + const struct inode *inode,
> + const struct kstat *stat
> + ),
> + TP_ARGS(inode, stat),
> + TP_STRUCT__entry(
> + __field(const void *, inode)
> + __field(unsigned long, mask)
> + __field(u32, mem_align)
> + __field(u32, offset_align)
> + __field(u32, read_offset_align)
> + ),
> + TP_fast_assign(
> + __entry->inode = inode;
> + __entry->mask = stat->result_mask;
> + __entry->mem_align = stat->dio_mem_align;
> + __entry->offset_align = stat->dio_offset_align;
> + __entry->read_offset_align = stat->dio_read_offset_align;
> + ),
> + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u",
> + __entry->inode, show_statx_mask(__entry->mask),
> + __entry->mem_align, __entry->offset_align,
> + __entry->read_offset_align
> + )
> +);
> +
> TRACE_EVENT(nfsd_file_acquire,
> TP_PROTO(
> const struct svc_rqst *rqstp,
> diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h
> index 0406ebe2a80a..7ead1c61f0cb 100644
> --- a/include/trace/misc/fs.h
> +++ b/include/trace/misc/fs.h
> @@ -141,3 +141,25 @@
> { ATTR_TIMES_SET, "TIMES_SET" }, \
> { ATTR_TOUCH, "TOUCH"}, \
> { ATTR_DELEG, "DELEG"})
> +
> +#define show_statx_mask(flags) \
> + __print_flags(flags, "|", \
> + { STATX_TYPE, "TYPE" }, \
> + { STATX_MODE, "MODE" }, \
> + { STATX_NLINK, "NLINK" }, \
> + { STATX_UID, "UID" }, \
> + { STATX_GID, "GID" }, \
> + { STATX_ATIME, "ATIME" }, \
> + { STATX_MTIME, "MTIME" }, \
> + { STATX_CTIME, "CTIME" }, \
> + { STATX_INO, "INO" }, \
> + { STATX_SIZE, "SIZE" }, \
> + { STATX_BLOCKS, "BLOCKS" }, \
> + { STATX_BASIC_STATS, "BASIC_STATS" }, \
> + { STATX_BTIME, "BTIME" }, \
> + { STATX_MNT_ID, "MNT_ID" }, \
> + { STATX_DIOALIGN, "DIOALIGN" }, \
> + { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \
> + { STATX_SUBVOL, "SUBVOL" }, \
> + { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \
> + { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" })
--
Chuck Lever
* [PATCH v3 2/3] NFSD: pass nfsd_file to nfsd_iter_read()
2025-09-22 14:11 [PATCH v3 0/3] NFSD direct I/O read Chuck Lever
2025-09-22 14:11 ` [PATCH v3 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Chuck Lever
@ 2025-09-22 14:11 ` Chuck Lever
2025-09-22 14:11 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ Chuck Lever
2 siblings, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2025-09-22 14:11 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Mike Snitzer
From: Mike Snitzer <snitzer@kernel.org>
Prepare for nfsd_iter_read() to use the DIO alignment stored in
nfsd_file by passing the nfsd_file to nfsd_iter_read() rather than
just the file, which is associated with the nfsd_file.
This means nfsd4_encode_readv() now also needs the nfsd_file rather
than the file. Instead of changing the file arg to be the nfsd_file,
we discard the file arg as the nfsd_file (and indeed the file) is
already available via the "read" argument.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfs4xdr.c | 8 ++++----
fs/nfsd/vfs.c | 7 ++++---
fs/nfsd/vfs.h | 2 +-
3 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 97e9e9afa80a..41f0d54d6e1b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4467,7 +4467,7 @@ static __be32 nfsd4_encode_splice_read(
static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
unsigned int base = xdr->buf->page_len & ~PAGE_MASK;
@@ -4478,7 +4478,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
if (xdr_reserve_space_vec(xdr, maxcount) < 0)
return nfserr_resource;
- nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
+ nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, read->rd_nf,
read->rd_offset, &maxcount, base,
&read->rd_eof);
read->rd_length = maxcount;
@@ -4525,7 +4525,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
- nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ nfserr = nfsd4_encode_readv(resp, read, maxcount);
if (nfserr) {
xdr_truncate_encode(xdr, eof_offset);
return nfserr;
@@ -5421,7 +5421,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
- nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ nfserr = nfsd4_encode_readv(resp, read, maxcount);
if (nfserr)
return nfserr;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2026431500ec..35880d3f1326 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1078,7 +1078,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
- * @file: opened struct file of file to be read
+ * @nf: opened struct nfsd_file of file to be read
* @offset: starting byte offset
* @count: IN: requested number of bytes; OUT: number of bytes read
* @base: offset in first page of read buffer
@@ -1091,9 +1091,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
* returned.
*/
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset, unsigned long *count,
+ struct nfsd_file *nf, loff_t offset, unsigned long *count,
unsigned int base, u32 *eof)
{
+ struct file *file = nf->nf_file;
unsigned long v, total;
struct iov_iter iter;
struct kiocb kiocb;
@@ -1336,7 +1337,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
- err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
+ err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);
nfsd_file_put(nf);
trace_nfsd_read_done(rqstp, fhp, offset, *count);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 0c0292611c6d..fa46f8b5f132 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -121,7 +121,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned long *count,
u32 *eof);
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset,
+ struct nfsd_file *nf, loff_t offset,
unsigned long *count, unsigned int base,
u32 *eof);
bool nfsd_read_splice_ok(struct svc_rqst *rqstp);
--
2.51.0
* [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-22 14:11 [PATCH v3 0/3] NFSD direct I/O read Chuck Lever
2025-09-22 14:11 ` [PATCH v3 1/3] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Chuck Lever
2025-09-22 14:11 ` [PATCH v3 2/3] NFSD: pass nfsd_file to nfsd_iter_read() Chuck Lever
@ 2025-09-22 14:11 ` Chuck Lever
2025-09-23 22:26 ` Mike Snitzer
2 siblings, 1 reply; 19+ messages in thread
From: Chuck Lever @ 2025-09-22 14:11 UTC (permalink / raw)
To: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey
Cc: linux-nfs, Chuck Lever, Mike Snitzer
From: Chuck Lever <chuck.lever@oracle.com>
Add an experimental option that forces NFS READ operations to use
direct I/O instead of reading through the NFS server's page cache.
There are already other layers of caching:
- The page cache on NFS clients
- The block device underlying the exported file system
The server's page cache, in many cases, is unlikely to provide
additional benefit. Some benchmarks have demonstrated that the
server's page cache is actively detrimental for workloads whose
working set is larger than the server's available physical memory.
For instance, on small NFS servers, cached NFS file content can
squeeze out local memory consumers. For large sequential workloads,
an enormous amount of data flows into and out of the page cache
and is consumed by NFS clients exactly once -- caching that data
is expensive to do and totally valueless.
For now this is a hidden option that can be enabled on test
systems for benchmarking. In the longer term, this option might
be enabled persistently or per-export. When the exported file
system does not support direct I/O, NFSD falls back to using
either DONTCACHE or buffered I/O to fulfill NFS READ requests.
Suggested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/debugfs.c | 2 ++
fs/nfsd/nfsd.h | 1 +
fs/nfsd/trace.h | 1 +
fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 86 insertions(+)
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index ed2b9e066206..00eb1ecef6ac 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
* Contents:
* %0: NFS READ will use buffered IO
* %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
+ * %2: NFS READ will use direct IO
*
* This setting takes immediate effect for all NFS versions,
* all exports, and in all NFSD net namespaces.
@@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
nfsd_io_cache_read = NFSD_IO_BUFFERED;
break;
case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
/*
* Must disable splice_read when enabling
* NFSD_IO_DONTCACHE.
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index ea87b42894dd..bdb60ee1f1a4 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -157,6 +157,7 @@ enum {
/* Any new NFSD_IO enum value must be added at the end */
NFSD_IO_BUFFERED,
NFSD_IO_DONTCACHE,
+ NFSD_IO_DIRECT,
};
extern u64 nfsd_io_cache_read __read_mostly;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 6e2c8e2aab10..bfd41236aff2 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
DEFINE_NFSD_IO_EVENT(read_start);
DEFINE_NFSD_IO_EVENT(read_splice);
DEFINE_NFSD_IO_EVENT(read_vector);
+DEFINE_NFSD_IO_EVENT(read_direct);
DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 35880d3f1326..ddcd812f0761 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
+/*
+ * The byte range of the client's READ request is expanded on both
+ * ends until it meets the underlying file system's direct I/O
+ * alignment requirements. After the internal read is complete, the
+ * byte range of the NFS READ payload is reduced to the byte range
+ * that was originally requested.
+ *
+ * Note that a direct read can be done only when the xdr_buf
+ * containing the NFS READ reply does not already have contents in
+ * its .pages array. This is due to potentially restrictive
+ * alignment requirements on the read buffer. When .page_len and
+ * @base are zero, the .pages array is guaranteed to be page-
+ * aligned.
+ */
+static noinline_for_stack __be32
+nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset, unsigned long *count,
+ u32 *eof)
+{
+ loff_t dio_start, dio_end;
+ unsigned long v, total;
+ struct iov_iter iter;
+ struct kiocb kiocb;
+ ssize_t host_err;
+ size_t len;
+
+ init_sync_kiocb(&kiocb, nf->nf_file);
+ kiocb.ki_flags |= IOCB_DIRECT;
+
+ /* Read a properly-aligned region of bytes into rq_bvec */
+ dio_start = round_down(offset, nf->nf_dio_read_offset_align);
+ dio_end = round_up(offset + *count, nf->nf_dio_read_offset_align);
+
+ kiocb.ki_pos = dio_start;
+
+ v = 0;
+ total = dio_end - dio_start;
+ while (total && v < rqstp->rq_maxpages &&
+ rqstp->rq_next_page < rqstp->rq_page_end) {
+ len = min_t(size_t, total, PAGE_SIZE);
+ bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page,
+ len, 0);
+
+ total -= len;
+ ++rqstp->rq_next_page;
+ ++v;
+ }
+
+ trace_nfsd_read_direct(rqstp, fhp, offset, *count - total);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v,
+ dio_end - dio_start - total);
+
+ host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter);
+ if (host_err >= 0) {
+ unsigned int pad = offset - dio_start;
+
+ /* The returned payload starts after the pad */
+ rqstp->rq_res.page_base = pad;
+
+ /* Compute the count of bytes to be returned */
+ if (host_err > pad + *count) {
+ host_err = *count;
+ } else if (host_err > pad) {
+ host_err -= pad;
+ } else {
+ host_err = 0;
+ }
+ } else if (unlikely(host_err == -EINVAL)) {
+ pr_info_ratelimited("nfsd: Unexpected direct I/O alignment failure\n");
+ host_err = -ESERVERFAULT;
+ }
+
+ return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count,
+ eof, host_err);
+}
+
/**
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
@@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
switch (nfsd_io_cache_read) {
case NFSD_IO_BUFFERED:
break;
+ case NFSD_IO_DIRECT:
+ if (nf->nf_dio_read_offset_align &&
+ rqstp->rq_res.page_len && !base)
+ return nfsd_direct_read(rqstp, fhp, nf, offset,
+ count, eof);
+ fallthrough;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
kiocb.ki_flags = IOCB_DONTCACHE;
--
2.51.0
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-22 14:11 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ Chuck Lever
@ 2025-09-23 22:26 ` Mike Snitzer
2025-09-23 22:49 ` Chuck Lever
0 siblings, 1 reply; 19+ messages in thread
From: Mike Snitzer @ 2025-09-23 22:26 UTC (permalink / raw)
To: Chuck Lever
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> Add an experimental option that forces NFS READ operations to use
> direct I/O instead of reading through the NFS server's page cache.
>
> There are already other layers of caching:
> - The page cache on NFS clients
> - The block device underlying the exported file system
>
> The server's page cache, in many cases, is unlikely to provide
> additional benefit. Some benchmarks have demonstrated that the
> server's page cache is actively detrimental for workloads whose
> working set is larger than the server's available physical memory.
>
> For instance, on small NFS servers, cached NFS file content can
> squeeze out local memory consumers. For large sequential workloads,
> an enormous amount of data flows into and out of the page cache
> and is consumed by NFS clients exactly once -- caching that data
> is expensive to do and totally valueless.
>
> For now this is a hidden option that can be enabled on test
> systems for benchmarking. In the longer term, this option might
> be enabled persistently or per-export. When the exported file
> system does not support direct I/O, NFSD falls back to using
> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
>
> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> fs/nfsd/debugfs.c | 2 ++
> fs/nfsd/nfsd.h | 1 +
> fs/nfsd/trace.h | 1 +
> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 86 insertions(+)
>
> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> index ed2b9e066206..00eb1ecef6ac 100644
> --- a/fs/nfsd/debugfs.c
> +++ b/fs/nfsd/debugfs.c
> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> * Contents:
> * %0: NFS READ will use buffered IO
> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> + * %2: NFS READ will use direct IO
> *
> * This setting takes immediate effect for all NFS versions,
> * all exports, and in all NFSD net namespaces.
> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> break;
> case NFSD_IO_DONTCACHE:
> + case NFSD_IO_DIRECT:
> /*
> * Must disable splice_read when enabling
> * NFSD_IO_DONTCACHE.
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index ea87b42894dd..bdb60ee1f1a4 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -157,6 +157,7 @@ enum {
> /* Any new NFSD_IO enum value must be added at the end */
> NFSD_IO_BUFFERED,
> NFSD_IO_DONTCACHE,
> + NFSD_IO_DIRECT,
> };
>
> extern u64 nfsd_io_cache_read __read_mostly;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 6e2c8e2aab10..bfd41236aff2 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> DEFINE_NFSD_IO_EVENT(read_start);
> DEFINE_NFSD_IO_EVENT(read_splice);
> DEFINE_NFSD_IO_EVENT(read_vector);
> +DEFINE_NFSD_IO_EVENT(read_direct);
> DEFINE_NFSD_IO_EVENT(read_io_done);
> DEFINE_NFSD_IO_EVENT(read_done);
> DEFINE_NFSD_IO_EVENT(write_start);
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 35880d3f1326..ddcd812f0761 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> }
>
> +/*
> + * The byte range of the client's READ request is expanded on both
> + * ends until it meets the underlying file system's direct I/O
> + * alignment requirements. After the internal read is complete, the
> + * byte range of the NFS READ payload is reduced to the byte range
> + * that was originally requested.
> + *
> + * Note that a direct read can be done only when the xdr_buf
> + * containing the NFS READ reply does not already have contents in
> + * its .pages array. This is due to potentially restrictive
> + * alignment requirements on the read buffer. When .page_len and
> + * @base are zero, the .pages array is guaranteed to be page-
> + * aligned.
> + */
So this ^ comment (and the related conversation with Neil in a
different thread) says page_len should be 0 on entry to
nfsd_direct_read.
> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> switch (nfsd_io_cache_read) {
> case NFSD_IO_BUFFERED:
> break;
> + case NFSD_IO_DIRECT:
> + if (nf->nf_dio_read_offset_align &&
> + rqstp->rq_res.page_len && !base)
> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> + count, eof);
> + fallthrough;
Yet the nfsd_iter_read is only calling nfsd_direct_read() if
rqstp->rq_res.page_len is not zero, shouldn't it be
!rqstp->rq_res.page_len ?
(testing confirms it should be !rqstp->rq_res.page_len)
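For clarity, a minimal sketch of what the corrected hunk would look
like (only the page_len test changes; everything else is as posted
in v3):

	case NFSD_IO_DIRECT:
		if (nf->nf_dio_read_offset_align &&
		    !rqstp->rq_res.page_len && !base)
			return nfsd_direct_read(rqstp, fhp, nf, offset,
						count, eof);
		fallthrough;

That way the direct path is taken only while the reply's .pages
array is still empty (and therefore page-aligned), which matches the
comment above nfsd_direct_read().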
Hopefully with this fix you can have more confidence in staging this
in your nfsd-testing?
Thanks,
Mike
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-23 22:26 ` Mike Snitzer
@ 2025-09-23 22:49 ` Chuck Lever
2025-09-23 23:31 ` Mike Snitzer
2025-09-23 23:48 ` NeilBrown
0 siblings, 2 replies; 19+ messages in thread
From: Chuck Lever @ 2025-09-23 22:49 UTC (permalink / raw)
To: Mike Snitzer
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On 9/23/25 3:26 PM, Mike Snitzer wrote:
> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>>
>> Add an experimental option that forces NFS READ operations to use
>> direct I/O instead of reading through the NFS server's page cache.
>>
>> There are already other layers of caching:
>> - The page cache on NFS clients
>> - The block device underlying the exported file system
>>
>> The server's page cache, in many cases, is unlikely to provide
>> additional benefit. Some benchmarks have demonstrated that the
>> server's page cache is actively detrimental for workloads whose
>> working set is larger than the server's available physical memory.
>>
>> For instance, on small NFS servers, cached NFS file content can
>> squeeze out local memory consumers. For large sequential workloads,
>> an enormous amount of data flows into and out of the page cache
>> and is consumed by NFS clients exactly once -- caching that data
>> is expensive to do and totally valueless.
>>
>> For now this is a hidden option that can be enabled on test
>> systems for benchmarking. In the longer term, this option might
>> be enabled persistently or per-export. When the exported file
>> system does not support direct I/O, NFSD falls back to using
>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
>>
>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfsd/debugfs.c | 2 ++
>> fs/nfsd/nfsd.h | 1 +
>> fs/nfsd/trace.h | 1 +
>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 86 insertions(+)
>>
>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
>> index ed2b9e066206..00eb1ecef6ac 100644
>> --- a/fs/nfsd/debugfs.c
>> +++ b/fs/nfsd/debugfs.c
>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
>> * Contents:
>> * %0: NFS READ will use buffered IO
>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
>> + * %2: NFS READ will use direct IO
>> *
>> * This setting takes immediate effect for all NFS versions,
>> * all exports, and in all NFSD net namespaces.
>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
>> break;
>> case NFSD_IO_DONTCACHE:
>> + case NFSD_IO_DIRECT:
>> /*
>> * Must disable splice_read when enabling
>> * NFSD_IO_DONTCACHE.
>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>> index ea87b42894dd..bdb60ee1f1a4 100644
>> --- a/fs/nfsd/nfsd.h
>> +++ b/fs/nfsd/nfsd.h
>> @@ -157,6 +157,7 @@ enum {
>> /* Any new NFSD_IO enum value must be added at the end */
>> NFSD_IO_BUFFERED,
>> NFSD_IO_DONTCACHE,
>> + NFSD_IO_DIRECT,
>> };
>>
>> extern u64 nfsd_io_cache_read __read_mostly;
>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
>> index 6e2c8e2aab10..bfd41236aff2 100644
>> --- a/fs/nfsd/trace.h
>> +++ b/fs/nfsd/trace.h
>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
>> DEFINE_NFSD_IO_EVENT(read_start);
>> DEFINE_NFSD_IO_EVENT(read_splice);
>> DEFINE_NFSD_IO_EVENT(read_vector);
>> +DEFINE_NFSD_IO_EVENT(read_direct);
>> DEFINE_NFSD_IO_EVENT(read_io_done);
>> DEFINE_NFSD_IO_EVENT(read_done);
>> DEFINE_NFSD_IO_EVENT(write_start);
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index 35880d3f1326..ddcd812f0761 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
>> }
>>
>> +/*
>> + * The byte range of the client's READ request is expanded on both
>> + * ends until it meets the underlying file system's direct I/O
>> + * alignment requirements. After the internal read is complete, the
>> + * byte range of the NFS READ payload is reduced to the byte range
>> + * that was originally requested.
>> + *
>> + * Note that a direct read can be done only when the xdr_buf
>> + * containing the NFS READ reply does not already have contents in
>> + * its .pages array. This is due to potentially restrictive
>> + * alignment requirements on the read buffer. When .page_len and
>> + * @base are zero, the .pages array is guaranteed to be page-
>> + * aligned.
>> + */
>
> So this ^ comment (and the related conversation with Neil in a
> different thread) says page_len should be 0 on entry to
> nfsd_direct_read.
>
>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> switch (nfsd_io_cache_read) {
>> case NFSD_IO_BUFFERED:
>> break;
>> + case NFSD_IO_DIRECT:
>> + if (nf->nf_dio_read_offset_align &&
>> + rqstp->rq_res.page_len && !base)
>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
>> + count, eof);
>> + fallthrough;
>
> Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> rqstp->rq_res.page_len is not zero, shouldn't it be
> !rqstp->rq_res.page_len ?
Oops, yes. I did this work last week, while out of range of my lab.
> (testing confirms it should be !rqstp->rq_res.page_len)
>
> Hopefully with this fix you can have more confidence in staging this
> in your nfsd-testing?
I'm waiting only for Neil to send an R-b.
--
Chuck Lever
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-23 22:49 ` Chuck Lever
@ 2025-09-23 23:31 ` Mike Snitzer
2025-09-23 23:48 ` NeilBrown
1 sibling, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2025-09-23 23:31 UTC (permalink / raw)
To: Chuck Lever
Cc: NeilBrown, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Tue, Sep 23, 2025 at 06:49:35PM -0400, Chuck Lever wrote:
> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >>
> >> Add an experimental option that forces NFS READ operations to use
> >> direct I/O instead of reading through the NFS server's page cache.
> >>
> >> There are already other layers of caching:
> >> - The page cache on NFS clients
> >> - The block device underlying the exported file system
> >>
> >> The server's page cache, in many cases, is unlikely to provide
> >> additional benefit. Some benchmarks have demonstrated that the
> >> server's page cache is actively detrimental for workloads whose
> >> working set is larger than the server's available physical memory.
> >>
> >> For instance, on small NFS servers, cached NFS file content can
> >> squeeze out local memory consumers. For large sequential workloads,
> >> an enormous amount of data flows into and out of the page cache
> >> and is consumed by NFS clients exactly once -- caching that data
> >> is expensive to do and totally valueless.
> >>
> >> For now this is a hidden option that can be enabled on test
> >> systems for benchmarking. In the longer term, this option might
> >> be enabled persistently or per-export. When the exported file
> >> system does not support direct I/O, NFSD falls back to using
> >> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> >>
> >> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> >> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> >> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/nfsd/debugfs.c | 2 ++
> >> fs/nfsd/nfsd.h | 1 +
> >> fs/nfsd/trace.h | 1 +
> >> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> >> 4 files changed, 86 insertions(+)
> >>
> >> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> >> index ed2b9e066206..00eb1ecef6ac 100644
> >> --- a/fs/nfsd/debugfs.c
> >> +++ b/fs/nfsd/debugfs.c
> >> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> >> * Contents:
> >> * %0: NFS READ will use buffered IO
> >> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> >> + * %2: NFS READ will use direct IO
> >> *
> >> * This setting takes immediate effect for all NFS versions,
> >> * all exports, and in all NFSD net namespaces.
> >> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> >> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> >> break;
> >> case NFSD_IO_DONTCACHE:
> >> + case NFSD_IO_DIRECT:
> >> /*
> >> * Must disable splice_read when enabling
> >> * NFSD_IO_DONTCACHE.
> >> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >> index ea87b42894dd..bdb60ee1f1a4 100644
> >> --- a/fs/nfsd/nfsd.h
> >> +++ b/fs/nfsd/nfsd.h
> >> @@ -157,6 +157,7 @@ enum {
> >> /* Any new NFSD_IO enum value must be added at the end */
> >> NFSD_IO_BUFFERED,
> >> NFSD_IO_DONTCACHE,
> >> + NFSD_IO_DIRECT,
> >> };
> >>
> >> extern u64 nfsd_io_cache_read __read_mostly;
> >> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> >> index 6e2c8e2aab10..bfd41236aff2 100644
> >> --- a/fs/nfsd/trace.h
> >> +++ b/fs/nfsd/trace.h
> >> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> >> DEFINE_NFSD_IO_EVENT(read_start);
> >> DEFINE_NFSD_IO_EVENT(read_splice);
> >> DEFINE_NFSD_IO_EVENT(read_vector);
> >> +DEFINE_NFSD_IO_EVENT(read_direct);
> >> DEFINE_NFSD_IO_EVENT(read_io_done);
> >> DEFINE_NFSD_IO_EVENT(read_done);
> >> DEFINE_NFSD_IO_EVENT(write_start);
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index 35880d3f1326..ddcd812f0761 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> >> }
> >>
> >> +/*
> >> + * The byte range of the client's READ request is expanded on both
> >> + * ends until it meets the underlying file system's direct I/O
> >> + * alignment requirements. After the internal read is complete, the
> >> + * byte range of the NFS READ payload is reduced to the byte range
> >> + * that was originally requested.
> >> + *
> >> + * Note that a direct read can be done only when the xdr_buf
> >> + * containing the NFS READ reply does not already have contents in
> >> + * its .pages array. This is due to potentially restrictive
> >> + * alignment requirements on the read buffer. When .page_len and
> >> + * @base are zero, the .pages array is guaranteed to be page-
> >> + * aligned.
> >> + */
> >
> > So this ^ comment (and the related conversation with Neil in a
> > different thread) says page_len should be 0 on entry to
> > nfsd_direct_read.
> >
> >> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> switch (nfsd_io_cache_read) {
> >> case NFSD_IO_BUFFERED:
> >> break;
> >> + case NFSD_IO_DIRECT:
> >> + if (nf->nf_dio_read_offset_align &&
> >> + rqstp->rq_res.page_len && !base)
> >> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> >> + count, eof);
> >> + fallthrough;
> >
> > Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> > rqstp->rq_res.page_len is not zero, shouldn't it be
> > !rqstp->rq_res.page_len ?
>
> Oops, yes. I did this work last week, while out of range of my lab.
Sure.
>
> > (testing confirms it should be !rqstp->rq_res.page_len)
> >
> > Hopefully with this fix you can have more confidence in staging this
> > in your nfsd-testing?
> I'm waiting only for Neil to send an R-b.
OK, makes sense. For some reason I thought you had that for patch 3.
Mike
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-23 22:49 ` Chuck Lever
2025-09-23 23:31 ` Mike Snitzer
@ 2025-09-23 23:48 ` NeilBrown
2025-09-23 23:52 ` Chuck Lever
1 sibling, 1 reply; 19+ messages in thread
From: NeilBrown @ 2025-09-23 23:48 UTC (permalink / raw)
To: Chuck Lever
Cc: Mike Snitzer, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Wed, 24 Sep 2025, Chuck Lever wrote:
> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >>
> >> Add an experimental option that forces NFS READ operations to use
> >> direct I/O instead of reading through the NFS server's page cache.
> >>
> >> There are already other layers of caching:
> >> - The page cache on NFS clients
> >> - The block device underlying the exported file system
> >>
> >> The server's page cache, in many cases, is unlikely to provide
> >> additional benefit. Some benchmarks have demonstrated that the
> >> server's page cache is actively detrimental for workloads whose
> >> working set is larger than the server's available physical memory.
> >>
> >> For instance, on small NFS servers, cached NFS file content can
> >> squeeze out local memory consumers. For large sequential workloads,
> >> an enormous amount of data flows into and out of the page cache
> >> and is consumed by NFS clients exactly once -- caching that data
> >> is expensive to do and totally valueless.
> >>
> >> For now this is a hidden option that can be enabled on test
> >> systems for benchmarking. In the longer term, this option might
> >> be enabled persistently or per-export. When the exported file
> >> system does not support direct I/O, NFSD falls back to using
> >> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> >>
> >> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> >> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> >> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/nfsd/debugfs.c | 2 ++
> >> fs/nfsd/nfsd.h | 1 +
> >> fs/nfsd/trace.h | 1 +
> >> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> >> 4 files changed, 86 insertions(+)
> >>
> >> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> >> index ed2b9e066206..00eb1ecef6ac 100644
> >> --- a/fs/nfsd/debugfs.c
> >> +++ b/fs/nfsd/debugfs.c
> >> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> >> * Contents:
> >> * %0: NFS READ will use buffered IO
> >> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> >> + * %2: NFS READ will use direct IO
> >> *
> >> * This setting takes immediate effect for all NFS versions,
> >> * all exports, and in all NFSD net namespaces.
> >> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> >> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> >> break;
> >> case NFSD_IO_DONTCACHE:
> >> + case NFSD_IO_DIRECT:
> >> /*
> >> * Must disable splice_read when enabling
> >> * NFSD_IO_DONTCACHE.
> >> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >> index ea87b42894dd..bdb60ee1f1a4 100644
> >> --- a/fs/nfsd/nfsd.h
> >> +++ b/fs/nfsd/nfsd.h
> >> @@ -157,6 +157,7 @@ enum {
> >> /* Any new NFSD_IO enum value must be added at the end */
> >> NFSD_IO_BUFFERED,
> >> NFSD_IO_DONTCACHE,
> >> + NFSD_IO_DIRECT,
> >> };
> >>
> >> extern u64 nfsd_io_cache_read __read_mostly;
> >> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> >> index 6e2c8e2aab10..bfd41236aff2 100644
> >> --- a/fs/nfsd/trace.h
> >> +++ b/fs/nfsd/trace.h
> >> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> >> DEFINE_NFSD_IO_EVENT(read_start);
> >> DEFINE_NFSD_IO_EVENT(read_splice);
> >> DEFINE_NFSD_IO_EVENT(read_vector);
> >> +DEFINE_NFSD_IO_EVENT(read_direct);
> >> DEFINE_NFSD_IO_EVENT(read_io_done);
> >> DEFINE_NFSD_IO_EVENT(read_done);
> >> DEFINE_NFSD_IO_EVENT(write_start);
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index 35880d3f1326..ddcd812f0761 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> >> }
> >>
> >> +/*
> >> + * The byte range of the client's READ request is expanded on both
> >> + * ends until it meets the underlying file system's direct I/O
> >> + * alignment requirements. After the internal read is complete, the
> >> + * byte range of the NFS READ payload is reduced to the byte range
> >> + * that was originally requested.
> >> + *
> >> + * Note that a direct read can be done only when the xdr_buf
> >> + * containing the NFS READ reply does not already have contents in
> >> + * its .pages array. This is due to potentially restrictive
> >> + * alignment requirements on the read buffer. When .page_len and
> >> + * @base are zero, the .pages array is guaranteed to be page-
> >> + * aligned.
> >> + */
> >
> > So this ^ comment (and the related conversation with Neil in a
> > different thread) says page_len should be 0 on entry to
> > nfsd_direct_read.
> >
> >> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> switch (nfsd_io_cache_read) {
> >> case NFSD_IO_BUFFERED:
> >> break;
> >> + case NFSD_IO_DIRECT:
> >> + if (nf->nf_dio_read_offset_align &&
> >> + rqstp->rq_res.page_len && !base)
> >> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> >> + count, eof);
> >> + fallthrough;
> >
> > Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> > rqstp->rq_res.page_len is not zero, shouldn't it be
> > !rqstp->rq_res.page_len ?
>
> Oops, yes. I did this work last week, while out of range of my lab.
>
>
> > (testing confirms it should be !rqstp->rq_res.page_len)
> >
> > Hopefully with this fix you can have more confidence in staging this
> > in your nfsd-testing?
> I'm waiting only for Neil to send an R-b.
After noticing, like Mike, that the page_len test was inverted I went
looking to see where page_len was updated, to be sure that a second READ
request would not try to use DIRECT IO.
I can see that nfsd_splice_actor() updates page_len, but I cannot see
where it is updated when nfsd_iter_read() is used.
What am I missing?
NeilBrown
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-23 23:48 ` NeilBrown
@ 2025-09-23 23:52 ` Chuck Lever
2025-09-24 7:30 ` NeilBrown
0 siblings, 1 reply; 19+ messages in thread
From: Chuck Lever @ 2025-09-23 23:52 UTC (permalink / raw)
To: NeilBrown
Cc: Mike Snitzer, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On 9/23/25 4:48 PM, NeilBrown wrote:
> On Wed, 24 Sep 2025, Chuck Lever wrote:
>> On 9/23/25 3:26 PM, Mike Snitzer wrote:
>>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>
>>>> Add an experimental option that forces NFS READ operations to use
>>>> direct I/O instead of reading through the NFS server's page cache.
>>>>
>>>> There are already other layers of caching:
>>>> - The page cache on NFS clients
>>>> - The block device underlying the exported file system
>>>>
>>>> The server's page cache, in many cases, is unlikely to provide
>>>> additional benefit. Some benchmarks have demonstrated that the
>>>> server's page cache is actively detrimental for workloads whose
>>>> working set is larger than the server's available physical memory.
>>>>
>>>> For instance, on small NFS servers, cached NFS file content can
>>>> squeeze out local memory consumers. For large sequential workloads,
>>>> an enormous amount of data flows into and out of the page cache
>>>> and is consumed by NFS clients exactly once -- caching that data
>>>> is expensive to do and totally valueless.
>>>>
>>>> For now this is a hidden option that can be enabled on test
>>>> systems for benchmarking. In the longer term, this option might
>>>> be enabled persistently or per-export. When the exported file
>>>> system does not support direct I/O, NFSD falls back to using
>>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
>>>>
>>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
>>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>> fs/nfsd/debugfs.c | 2 ++
>>>> fs/nfsd/nfsd.h | 1 +
>>>> fs/nfsd/trace.h | 1 +
>>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
>>>> 4 files changed, 86 insertions(+)
>>>>
>>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
>>>> index ed2b9e066206..00eb1ecef6ac 100644
>>>> --- a/fs/nfsd/debugfs.c
>>>> +++ b/fs/nfsd/debugfs.c
>>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
>>>> * Contents:
>>>> * %0: NFS READ will use buffered IO
>>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
>>>> + * %2: NFS READ will use direct IO
>>>> *
>>>> * This setting takes immediate effect for all NFS versions,
>>>> * all exports, and in all NFSD net namespaces.
>>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
>>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
>>>> break;
>>>> case NFSD_IO_DONTCACHE:
>>>> + case NFSD_IO_DIRECT:
>>>> /*
>>>> * Must disable splice_read when enabling
>>>> * NFSD_IO_DONTCACHE.
>>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>>> index ea87b42894dd..bdb60ee1f1a4 100644
>>>> --- a/fs/nfsd/nfsd.h
>>>> +++ b/fs/nfsd/nfsd.h
>>>> @@ -157,6 +157,7 @@ enum {
>>>> /* Any new NFSD_IO enum value must be added at the end */
>>>> NFSD_IO_BUFFERED,
>>>> NFSD_IO_DONTCACHE,
>>>> + NFSD_IO_DIRECT,
>>>> };
>>>>
>>>> extern u64 nfsd_io_cache_read __read_mostly;
>>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
>>>> index 6e2c8e2aab10..bfd41236aff2 100644
>>>> --- a/fs/nfsd/trace.h
>>>> +++ b/fs/nfsd/trace.h
>>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
>>>> DEFINE_NFSD_IO_EVENT(read_start);
>>>> DEFINE_NFSD_IO_EVENT(read_splice);
>>>> DEFINE_NFSD_IO_EVENT(read_vector);
>>>> +DEFINE_NFSD_IO_EVENT(read_direct);
>>>> DEFINE_NFSD_IO_EVENT(read_io_done);
>>>> DEFINE_NFSD_IO_EVENT(read_done);
>>>> DEFINE_NFSD_IO_EVENT(write_start);
>>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>>> index 35880d3f1326..ddcd812f0761 100644
>>>> --- a/fs/nfsd/vfs.c
>>>> +++ b/fs/nfsd/vfs.c
>>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
>>>> }
>>>>
>>>> +/*
>>>> + * The byte range of the client's READ request is expanded on both
>>>> + * ends until it meets the underlying file system's direct I/O
>>>> + * alignment requirements. After the internal read is complete, the
>>>> + * byte range of the NFS READ payload is reduced to the byte range
>>>> + * that was originally requested.
>>>> + *
>>>> + * Note that a direct read can be done only when the xdr_buf
>>>> + * containing the NFS READ reply does not already have contents in
>>>> + * its .pages array. This is due to potentially restrictive
>>>> + * alignment requirements on the read buffer. When .page_len and
>>>> + * @base are zero, the .pages array is guaranteed to be page-
>>>> + * aligned.
>>>> + */
>>>
>>> So this ^ comment (and the related conversation with Neil in a
>>> different thread) says page_len should be 0 on entry to
>>> nfsd_direct_read.
>>>
>>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>>> switch (nfsd_io_cache_read) {
>>>> case NFSD_IO_BUFFERED:
>>>> break;
>>>> + case NFSD_IO_DIRECT:
>>>> + if (nf->nf_dio_read_offset_align &&
>>>> + rqstp->rq_res.page_len && !base)
>>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
>>>> + count, eof);
>>>> + fallthrough;
>>>
>>> Yet the nfsd_iter_read is only calling nfsd_direct_read() if
>>> rqstp->rq_res.page_len is not zero, shouldn't it be
>>> !rqstp->rq_res.page_len ?
>>
>> Oops, yes. I did this work last week, while out of range of my lab.
>>
>>
>>> (testing confirms it should be !rqstp->rq_res.page_len)
>>>
>>> Hopefully with this fix you can have more confidence in staging this
>>> in your nfsd-testing?
>> I'm waiting only for Neil to send an R-b.
>
> After noticing, like Mike, that the page_len test was inverted I went
> looking to see where page_len was updated, to be sure that a second READ
> request would not try to use DIRECT IO.
> I can see that nfsd_splice_actor() updates page_len, but I cannot see
> where it is updated when nfsd_iter_read() is used.
>
> What am I missing?
It might be updated while the NFSv4 reply encoder is encoding a
COMPOUND. If the size of the RPC reply so far is larger than the
xdr_buf's .head, the xdr_stream will be positioned somewhere in the
xdr_buf's .pages array.
This is precisely why splice READ can be used only for the first
NFSv4 READ operation in a COMPOUND. Subsequent READ operations
must use nfsd_iter_read().
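As a purely hypothetical illustration: in a COMPOUND like
{ SEQUENCE; PUTFH; READ(offset=0, count=128K); READ(offset=128K, count=128K) },
encoding the first READ places its payload into rq_res.pages, so
.page_len is already 128K by the time the second READ is encoded.
With the corrected !rqstp->rq_res.page_len test, that second READ
fails the check and falls through to the DONTCACHE/buffered path
instead of calling nfsd_direct_read().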
--
Chuck Lever
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-23 23:52 ` Chuck Lever
@ 2025-09-24 7:30 ` NeilBrown
2025-09-24 14:12 ` Mike Snitzer
2025-09-24 18:56 ` Chuck Lever
0 siblings, 2 replies; 19+ messages in thread
From: NeilBrown @ 2025-09-24 7:30 UTC (permalink / raw)
To: Chuck Lever
Cc: Mike Snitzer, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Wed, 24 Sep 2025, Chuck Lever wrote:
> On 9/23/25 4:48 PM, NeilBrown wrote:
> > On Wed, 24 Sep 2025, Chuck Lever wrote:
> >> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> >>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> >>>> From: Chuck Lever <chuck.lever@oracle.com>
> >>>>
> >>>> Add an experimental option that forces NFS READ operations to use
> >>>> direct I/O instead of reading through the NFS server's page cache.
> >>>>
> >>>> There are already other layers of caching:
> >>>> - The page cache on NFS clients
> >>>> - The block device underlying the exported file system
> >>>>
> >>>> The server's page cache, in many cases, is unlikely to provide
> >>>> additional benefit. Some benchmarks have demonstrated that the
> >>>> server's page cache is actively detrimental for workloads whose
> >>>> working set is larger than the server's available physical memory.
> >>>>
> >>>> For instance, on small NFS servers, cached NFS file content can
> >>>> squeeze out local memory consumers. For large sequential workloads,
> >>>> an enormous amount of data flows into and out of the page cache
> >>>> and is consumed by NFS clients exactly once -- caching that data
> >>>> is expensive to do and totally valueless.
> >>>>
> >>>> For now this is a hidden option that can be enabled on test
> >>>> systems for benchmarking. In the longer term, this option might
> >>>> be enabled persistently or per-export. When the exported file
> >>>> system does not support direct I/O, NFSD falls back to using
> >>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> >>>>
> >>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> >>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> >>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >>>> ---
> >>>> fs/nfsd/debugfs.c | 2 ++
> >>>> fs/nfsd/nfsd.h | 1 +
> >>>> fs/nfsd/trace.h | 1 +
> >>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> >>>> 4 files changed, 86 insertions(+)
> >>>>
> >>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> >>>> index ed2b9e066206..00eb1ecef6ac 100644
> >>>> --- a/fs/nfsd/debugfs.c
> >>>> +++ b/fs/nfsd/debugfs.c
> >>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> >>>> * Contents:
> >>>> * %0: NFS READ will use buffered IO
> >>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> >>>> + * %2: NFS READ will use direct IO
> >>>> *
> >>>> * This setting takes immediate effect for all NFS versions,
> >>>> * all exports, and in all NFSD net namespaces.
> >>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> >>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> >>>> break;
> >>>> case NFSD_IO_DONTCACHE:
> >>>> + case NFSD_IO_DIRECT:
> >>>> /*
> >>>> * Must disable splice_read when enabling
> >>>> * NFSD_IO_DONTCACHE.
> >>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >>>> index ea87b42894dd..bdb60ee1f1a4 100644
> >>>> --- a/fs/nfsd/nfsd.h
> >>>> +++ b/fs/nfsd/nfsd.h
> >>>> @@ -157,6 +157,7 @@ enum {
> >>>> /* Any new NFSD_IO enum value must be added at the end */
> >>>> NFSD_IO_BUFFERED,
> >>>> NFSD_IO_DONTCACHE,
> >>>> + NFSD_IO_DIRECT,
> >>>> };
> >>>>
> >>>> extern u64 nfsd_io_cache_read __read_mostly;
> >>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> >>>> index 6e2c8e2aab10..bfd41236aff2 100644
> >>>> --- a/fs/nfsd/trace.h
> >>>> +++ b/fs/nfsd/trace.h
> >>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> >>>> DEFINE_NFSD_IO_EVENT(read_start);
> >>>> DEFINE_NFSD_IO_EVENT(read_splice);
> >>>> DEFINE_NFSD_IO_EVENT(read_vector);
> >>>> +DEFINE_NFSD_IO_EVENT(read_direct);
> >>>> DEFINE_NFSD_IO_EVENT(read_io_done);
> >>>> DEFINE_NFSD_IO_EVENT(read_done);
> >>>> DEFINE_NFSD_IO_EVENT(write_start);
> >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >>>> index 35880d3f1326..ddcd812f0761 100644
> >>>> --- a/fs/nfsd/vfs.c
> >>>> +++ b/fs/nfsd/vfs.c
> >>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> >>>> }
> >>>>
> >>>> +/*
> >>>> + * The byte range of the client's READ request is expanded on both
> >>>> + * ends until it meets the underlying file system's direct I/O
> >>>> + * alignment requirements. After the internal read is complete, the
> >>>> + * byte range of the NFS READ payload is reduced to the byte range
> >>>> + * that was originally requested.
> >>>> + *
> >>>> + * Note that a direct read can be done only when the xdr_buf
> >>>> + * containing the NFS READ reply does not already have contents in
> >>>> + * its .pages array. This is due to potentially restrictive
> >>>> + * alignment requirements on the read buffer. When .page_len and
> >>>> + * @base are zero, the .pages array is guaranteed to be page-
> >>>> + * aligned.
> >>>> + */
> >>>
> >>> So this ^ comment (and the related conversation with Neil in a
> >>> different thread) says page_len should be 0 on entry to
> >>> nfsd_direct_read.
> >>>
> >>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>>> switch (nfsd_io_cache_read) {
> >>>> case NFSD_IO_BUFFERED:
> >>>> break;
> >>>> + case NFSD_IO_DIRECT:
> >>>> + if (nf->nf_dio_read_offset_align &&
> >>>> + rqstp->rq_res.page_len && !base)
> >>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> >>>> + count, eof);
> >>>> + fallthrough;
> >>>
> >>> Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> >>> rqstp->rq_res.page_len is not zero, shouldn't it be
> >>> !rqstp->rq_res.page_len ?
> >>
> >> Oops, yes. I did this work last week, while out of range of my lab.
> >>
> >>
> >>> (testing confirms it should be !rqstp->rq_res.page_len)
> >>>
> >>> Hopefully with this fix you can have more confidence in staging this
> >>> in your nfsd-testing?
> >> I'm waiting only for Neil to send an R-b.
> >
> > After noticing, like Mike, that the page_len test was inverted I went
> > looking to see where page_len was updated, to be sure that a second READ
> > request would not try to use DIRECT IO.
> > I can see that nfsd_splice_actor() updates page_len, but I cannot see
> > where it is updated when nfsd_iter_read() is used.
> >
> > What am I missing?
>
> It might be updated while the NFSv4 reply encoder is encoding a
> COMPOUND. If the size of the RPC reply so far is larger than the
> xdr_buf's .head, the xdr_stream will be positioned somewhere in the
> xdr_buf's .pages array.
>
> This is precisely why splice READ can be used only for the first
> NFSv4 READ operation in a COMPOUND. Subsequent READ operations
> must use nfsd_iter_read().
Hmmmm...
nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
the READ request. The maxcount is passed to xdr_reserve_space()
repeatedly (one page at a time) where it is added to xdr->buf->page_len
(where xdr is ->rq_res_stream and xdr->buf is rq_res).
So the result is often that rq_res.page_len will be maxcount.
Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
will test rq_res.page_len, which will always be non-zero.
So that isn't what we want.
(after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
which will reduce ->page_len based on how much was read).
Then nfsd4_encode_readv() calls nfsd_iter_read().
I don't think we can use ->page_len to determine if it is safe to use
nfsd_direct_read(). The conditions where nfsd_direct_read() is safe are
similar to the conditions where nfsd_splice_read() is safe. Maybe we
should use similar code.
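A rough sketch of that idea (untested, illustrative only, using names
from the hunks in this thread) would be to gate the direct path next to
the existing splice gate in nfsd_read():
	if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
		err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
	else if (nfsd_io_cache_read == NFSD_IO_DIRECT &&
		 nf->nf_dio_read_offset_align)
		err = nfsd_direct_read(rqstp, fhp, nf, offset, count, eof);
	else
		err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);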
Thanks,
NeilBrown
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-24 7:30 ` NeilBrown
@ 2025-09-24 14:12 ` Mike Snitzer
2025-09-24 15:18 ` Mike Snitzer
2025-09-26 0:18 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ NeilBrown
2025-09-24 18:56 ` Chuck Lever
1 sibling, 2 replies; 19+ messages in thread
From: Mike Snitzer @ 2025-09-24 14:12 UTC (permalink / raw)
To: NeilBrown
Cc: Chuck Lever, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Wed, Sep 24, 2025 at 05:30:38PM +1000, NeilBrown wrote:
> On Wed, 24 Sep 2025, Chuck Lever wrote:
> > On 9/23/25 4:48 PM, NeilBrown wrote:
> > > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > >> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > >>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> > >>>> From: Chuck Lever <chuck.lever@oracle.com>
> > >>>>
> > >>>> Add an experimental option that forces NFS READ operations to use
> > >>>> direct I/O instead of reading through the NFS server's page cache.
> > >>>>
> > >>>> There are already other layers of caching:
> > >>>> - The page cache on NFS clients
> > >>>> - The block device underlying the exported file system
> > >>>>
> > >>>> The server's page cache, in many cases, is unlikely to provide
> > >>>> additional benefit. Some benchmarks have demonstrated that the
> > >>>> server's page cache is actively detrimental for workloads whose
> > >>>> working set is larger than the server's available physical memory.
> > >>>>
> > >>>> For instance, on small NFS servers, cached NFS file content can
> > >>>> squeeze out local memory consumers. For large sequential workloads,
> > >>>> an enormous amount of data flows into and out of the page cache
> > >>>> and is consumed by NFS clients exactly once -- caching that data
> > >>>> is expensive to do and totally valueless.
> > >>>>
> > >>>> For now this is a hidden option that can be enabled on test
> > >>>> systems for benchmarking. In the longer term, this option might
> > >>>> be enabled persistently or per-export. When the exported file
> > >>>> system does not support direct I/O, NFSD falls back to using
> > >>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> > >>>>
> > >>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> > >>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> > >>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > >>>> ---
> > >>>> fs/nfsd/debugfs.c | 2 ++
> > >>>> fs/nfsd/nfsd.h | 1 +
> > >>>> fs/nfsd/trace.h | 1 +
> > >>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> > >>>> 4 files changed, 86 insertions(+)
> > >>>>
> > >>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > >>>> index ed2b9e066206..00eb1ecef6ac 100644
> > >>>> --- a/fs/nfsd/debugfs.c
> > >>>> +++ b/fs/nfsd/debugfs.c
> > >>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> > >>>> * Contents:
> > >>>> * %0: NFS READ will use buffered IO
> > >>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> > >>>> + * %2: NFS READ will use direct IO
> > >>>> *
> > >>>> * This setting takes immediate effect for all NFS versions,
> > >>>> * all exports, and in all NFSD net namespaces.
> > >>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> > >>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> > >>>> break;
> > >>>> case NFSD_IO_DONTCACHE:
> > >>>> + case NFSD_IO_DIRECT:
> > >>>> /*
> > >>>> * Must disable splice_read when enabling
> > >>>> * NFSD_IO_DONTCACHE.
> > >>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > >>>> index ea87b42894dd..bdb60ee1f1a4 100644
> > >>>> --- a/fs/nfsd/nfsd.h
> > >>>> +++ b/fs/nfsd/nfsd.h
> > >>>> @@ -157,6 +157,7 @@ enum {
> > >>>> /* Any new NFSD_IO enum value must be added at the end */
> > >>>> NFSD_IO_BUFFERED,
> > >>>> NFSD_IO_DONTCACHE,
> > >>>> + NFSD_IO_DIRECT,
> > >>>> };
> > >>>>
> > >>>> extern u64 nfsd_io_cache_read __read_mostly;
> > >>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > >>>> index 6e2c8e2aab10..bfd41236aff2 100644
> > >>>> --- a/fs/nfsd/trace.h
> > >>>> +++ b/fs/nfsd/trace.h
> > >>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> > >>>> DEFINE_NFSD_IO_EVENT(read_start);
> > >>>> DEFINE_NFSD_IO_EVENT(read_splice);
> > >>>> DEFINE_NFSD_IO_EVENT(read_vector);
> > >>>> +DEFINE_NFSD_IO_EVENT(read_direct);
> > >>>> DEFINE_NFSD_IO_EVENT(read_io_done);
> > >>>> DEFINE_NFSD_IO_EVENT(read_done);
> > >>>> DEFINE_NFSD_IO_EVENT(write_start);
> > >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > >>>> index 35880d3f1326..ddcd812f0761 100644
> > >>>> --- a/fs/nfsd/vfs.c
> > >>>> +++ b/fs/nfsd/vfs.c
> > >>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> > >>>> }
> > >>>>
> > >>>> +/*
> > >>>> + * The byte range of the client's READ request is expanded on both
> > >>>> + * ends until it meets the underlying file system's direct I/O
> > >>>> + * alignment requirements. After the internal read is complete, the
> > >>>> + * byte range of the NFS READ payload is reduced to the byte range
> > >>>> + * that was originally requested.
> > >>>> + *
> > >>>> + * Note that a direct read can be done only when the xdr_buf
> > >>>> + * containing the NFS READ reply does not already have contents in
> > >>>> + * its .pages array. This is due to potentially restrictive
> > >>>> + * alignment requirements on the read buffer. When .page_len and
> > >>>> + * @base are zero, the .pages array is guaranteed to be page-
> > >>>> + * aligned.
> > >>>> + */
> > >>>
> > >>> So this ^ comment (and the related conversation with Neil in a
> > >>> different thread) says page_len should be 0 on entry to
> > >>> nfsd_direct_read.
> > >>>
> > >>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >>>> switch (nfsd_io_cache_read) {
> > >>>> case NFSD_IO_BUFFERED:
> > >>>> break;
> > >>>> + case NFSD_IO_DIRECT:
> > >>>> + if (nf->nf_dio_read_offset_align &&
> > >>>> + rqstp->rq_res.page_len && !base)
> > >>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> > >>>> + count, eof);
> > >>>> + fallthrough;
> > >>>
> > >>> Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> > >>> rqstp->rq_res.page_len is not zero, shouldn't it be
> > >>> !rqstp->rq_res.page_len ?
> > >>
> > >> Oops, yes. I did this work last week, while out of range of my lab.
> > >>
> > >>
> > >>> (testing confirms it should be !rqstp->rq_res.page_len)
> > >>>
> > >>> Hopefully with this fix you can have more confidence in staging this
> > >>> in your nfsd-testing?
> > >> I'm waiting only for Neil to send an R-b.
> > >
> > > After noticing, like Mike, that the page_len test was inverted I went
> > > looking to see where page_len was updated, to be sure that a second READ
> > > request would not try to use DIRECT IO.
> > > I can see that nfsd_splice_actor() updates page_len, but I cannot see
> > > where it is updated when nfsd_iter_read() is used.
> > >
> > > What am I missing?
> >
> > It might be updated while the NFSv4 reply encoder is encoding a
> > COMPOUND. If the size of the RPC reply so far is larger than the
> > xdr_buf's .head, the xdr_stream will be positioned somewhere in the
> > xdr_buf's .pages array.
> >
> > This is precisely why splice READ can be used only for the first
> > NFSv4 READ operation in a COMPOUND. Subsequent READ operations
> > must use nfsd_iter_read().
Hi Neil,
> Hmmmm...
>
> nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
> the READ request. The maxcount is passed to xdr_reserve_space()
> repeatedly (one page at a time) where it is added to xdr->buf->page_len
> (where xdr is ->rq_res_stream and xdr->buf is rq_res).
>
> So the result is often that rq_res.page_len will be maxcount.
>
> Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
> will test rq_res.page_len, which will always be non-zero.
> So that isn't what we want.
Not true. (And code-inspection-only review like this, that then makes
incorrect yet strong assertions, is pretty disruptive).
That said, I did follow along with your code inspection, and I see your
concern (but you do gloss over some code in xdr_get_next_encode_buffer
that would be cause for avoiding page_len += nbytes).
I'll add some tracing to pin this down.
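For example (an illustrative sketch only, not part of the posted
series), a one-off trace_printk() at the top of nfsd_iter_read() should
be enough to confirm the state on entry:
	/* temporary debug aid, sketch only */
	trace_printk("nfsd_iter_read: page_len=%u base=%u\n",
		     rqstp->rq_res.page_len, base);
The output can then be read from /sys/kernel/debug/tracing/trace while
the test runs.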
> (after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
> which will reduce ->page_len based on how much was read).
>
> Then nfsd4_encode_readv() calls nfsd_iter_read().
>
> I don't think we can use ->page_len to determine if it is safe to use
> nfsd_direct_read(). The conditions where nfsd_direct_read() is safe are
> similar to the conditions where nfsd_splice_read() is safe. Maybe we
> should use similar code.
Your takeaway from code inspection is clearly missing something
because page_len is 0 on entry to nfsd_iter_read for both TCP and RDMA
during my READ testing.
nfsd_direct_read is being called when io_cache_read is configured to
use NFSD_IO_DIRECT. So !page_len && !base isn't too limiting; it works.
Mike
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-24 14:12 ` Mike Snitzer
@ 2025-09-24 15:18 ` Mike Snitzer
2025-09-24 18:56 ` [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ"] Mike Snitzer
2025-09-26 0:18 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ NeilBrown
1 sibling, 1 reply; 19+ messages in thread
From: Mike Snitzer @ 2025-09-24 15:18 UTC (permalink / raw)
To: NeilBrown
Cc: Chuck Lever, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Wed, Sep 24, 2025 at 10:12:35AM -0400, Mike Snitzer wrote:
> On Wed, Sep 24, 2025 at 05:30:38PM +1000, NeilBrown wrote:
> > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > On 9/23/25 4:48 PM, NeilBrown wrote:
> > > > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > >> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > > >>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> > > >>>> From: Chuck Lever <chuck.lever@oracle.com>
> > > >>>>
> > > >>>> Add an experimental option that forces NFS READ operations to use
> > > >>>> direct I/O instead of reading through the NFS server's page cache.
> > > >>>>
> > > >>>> There are already other layers of caching:
> > > >>>> - The page cache on NFS clients
> > > >>>> - The block device underlying the exported file system
> > > >>>>
> > > >>>> The server's page cache, in many cases, is unlikely to provide
> > > >>>> additional benefit. Some benchmarks have demonstrated that the
> > > >>>> server's page cache is actively detrimental for workloads whose
> > > >>>> working set is larger than the server's available physical memory.
> > > >>>>
> > > >>>> For instance, on small NFS servers, cached NFS file content can
> > > >>>> squeeze out local memory consumers. For large sequential workloads,
> > > >>>> an enormous amount of data flows into and out of the page cache
> > > >>>> and is consumed by NFS clients exactly once -- caching that data
> > > >>>> is expensive to do and totally valueless.
> > > >>>>
> > > >>>> For now this is a hidden option that can be enabled on test
> > > >>>> systems for benchmarking. In the longer term, this option might
> > > >>>> be enabled persistently or per-export. When the exported file
> > > >>>> system does not support direct I/O, NFSD falls back to using
> > > >>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> > > >>>>
> > > >>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> > > >>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> > > >>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > >>>> ---
> > > >>>> fs/nfsd/debugfs.c | 2 ++
> > > >>>> fs/nfsd/nfsd.h | 1 +
> > > >>>> fs/nfsd/trace.h | 1 +
> > > >>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> > > >>>> 4 files changed, 86 insertions(+)
> > > >>>>
> > > >>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > > >>>> index ed2b9e066206..00eb1ecef6ac 100644
> > > >>>> --- a/fs/nfsd/debugfs.c
> > > >>>> +++ b/fs/nfsd/debugfs.c
> > > >>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> > > >>>> * Contents:
> > > >>>> * %0: NFS READ will use buffered IO
> > > >>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> > > >>>> + * %2: NFS READ will use direct IO
> > > >>>> *
> > > >>>> * This setting takes immediate effect for all NFS versions,
> > > >>>> * all exports, and in all NFSD net namespaces.
> > > >>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> > > >>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> > > >>>> break;
> > > >>>> case NFSD_IO_DONTCACHE:
> > > >>>> + case NFSD_IO_DIRECT:
> > > >>>> /*
> > > >>>> * Must disable splice_read when enabling
> > > >>>> * NFSD_IO_DONTCACHE.
> > > >>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > > >>>> index ea87b42894dd..bdb60ee1f1a4 100644
> > > >>>> --- a/fs/nfsd/nfsd.h
> > > >>>> +++ b/fs/nfsd/nfsd.h
> > > >>>> @@ -157,6 +157,7 @@ enum {
> > > >>>> /* Any new NFSD_IO enum value must be added at the end */
> > > >>>> NFSD_IO_BUFFERED,
> > > >>>> NFSD_IO_DONTCACHE,
> > > >>>> + NFSD_IO_DIRECT,
> > > >>>> };
> > > >>>>
> > > >>>> extern u64 nfsd_io_cache_read __read_mostly;
> > > >>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > >>>> index 6e2c8e2aab10..bfd41236aff2 100644
> > > >>>> --- a/fs/nfsd/trace.h
> > > >>>> +++ b/fs/nfsd/trace.h
> > > >>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> > > >>>> DEFINE_NFSD_IO_EVENT(read_start);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_splice);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_vector);
> > > >>>> +DEFINE_NFSD_IO_EVENT(read_direct);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_io_done);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_done);
> > > >>>> DEFINE_NFSD_IO_EVENT(write_start);
> > > >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > >>>> index 35880d3f1326..ddcd812f0761 100644
> > > >>>> --- a/fs/nfsd/vfs.c
> > > >>>> +++ b/fs/nfsd/vfs.c
> > > >>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> > > >>>> }
> > > >>>>
> > > >>>> +/*
> > > >>>> + * The byte range of the client's READ request is expanded on both
> > > >>>> + * ends until it meets the underlying file system's direct I/O
> > > >>>> + * alignment requirements. After the internal read is complete, the
> > > >>>> + * byte range of the NFS READ payload is reduced to the byte range
> > > >>>> + * that was originally requested.
> > > >>>> + *
> > > >>>> + * Note that a direct read can be done only when the xdr_buf
> > > >>>> + * containing the NFS READ reply does not already have contents in
> > > >>>> + * its .pages array. This is due to potentially restrictive
> > > >>>> + * alignment requirements on the read buffer. When .page_len and
> > > >>>> + * @base are zero, the .pages array is guaranteed to be page-
> > > >>>> + * aligned.
> > > >>>> + */
> > > >>>
> > > >>> So this ^ comment (and the related conversation with Neil in a
> > > >>> different thread) says page_len should be 0 on entry to
> > > >>> nfsd_direct_read.
> > > >>>
> > > >>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >>>> switch (nfsd_io_cache_read) {
> > > >>>> case NFSD_IO_BUFFERED:
> > > >>>> break;
> > > >>>> + case NFSD_IO_DIRECT:
> > > >>>> + if (nf->nf_dio_read_offset_align &&
> > > >>>> + rqstp->rq_res.page_len && !base)
> > > >>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> > > >>>> + count, eof);
> > > >>>> + fallthrough;
> > > >>>
> > > >>> Yet the nfsd_iter_read is only calling nfsd_direct_read() if
> > > >>> rqstp->rq_res.page_len is not zero, shouldn't it be
> > > >>> !rqstp->rq_res.page_len ?
> > > >>
> > > >> Oops, yes. I did this work last week, while out of range of my lab.
> > > >>
> > > >>
> > > >>> (testing confirms it should be !rqstp->rq_res.page_len)
> > > >>>
> > > >>> Hopefully with this fix you can have more confidence in staging this
> > > >>> in your nfsd-testing?
> > > >> I'm waiting only for Neil to send an R-b.
> > > >
> > > > After noticing, like Mike, that the page_len test was inverted I went
> > > > looking to see where page_len was updated, to be sure that a second READ
> > > > request would not try to use DIRECT IO.
> > > > I can see that nfsd_splice_actor() updates page_len, but I cannot see
> > > > where it is updated when nfsd_iter_read() is used.
> > > >
> > > > What am I missing?
> > >
> > > It might be updated while the NFSv4 reply encoder is encoding a
> > > COMPOUND. If the size of the RPC reply so far is larger than the
> > > xdr_buf's .head, the xdr_stream will be positioned somewhere in the
> > > xdr_buf's .pages array.
> > >
> > > This is precisely why splice READ can be used only for the first
> > > NFSv4 READ operation in a COMPOUND. Subsequent READ operations
> > > must use nfsd_iter_read().
>
> Hi Neil,
>
> > Hmmmm...
> >
> > nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
> > the READ request. The maxcount is passed to xdr_reserve_space()
> > repeatedly (one page at a time) where it is added to xdr->buf->page_len
> > (where xdr is ->rq_res_stream and xdr->buf is rq_res).
> >
> > So the result is often that rq_res.page_len will be maxcount.
> >
> > Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
> > will test rq_res.page_len, which will always be non-zero.
> > So that isn't what we want.
>
> Not true. (And code inspection only review like this, that then makes
> incorrect yet strong assertions, is pretty disruptive).
>
> That said, I did follow along with your code inspection, I see your
> concern (but you do gloss over some code in xdr_get_next_encode_buffer
> that would be cause for avoiding page_len += nbytes).
>
> I'll add some tracing to pin this down.
>
> > (after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
> > which will reduce ->page_len based on how much was read).
> >
> > Then nfsd4_encode_readv() calls nfsd_iter_read().
> >
> > I don't think we can use ->page_len to determine if it is safe to use
> > nfsd_direct_read(). The conditions where nfsd_direct_read() is safe are
> > similar to the conditions where nfsd_splice_read() is safe. Maybe we
> > should use similar code.
>
> Your takeaway from code inspection is clearly missing something
> because page_len is 0 on entry to nfsd_iter_read for both TCP and RDMA
> during my READ testing.
>
> nfsd_direct_read is being called when io_cache_read is configured to
> use NFSD_IO_DIRECT. So !page_len && !base isn't too limiting; it works.
FYI, I've been testing with NFSv3. So nfsd_read -> nfsd_iter_read
The last time I tested NFSv4 was with my DIO READ code that checked
the memory alignment with precision (albeit adding more work per IO,
which Chuck would like to avoid).
Mike
^ permalink raw reply [flat|nested] 19+ messages in thread
* [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ"]
2025-09-24 15:18 ` Mike Snitzer
@ 2025-09-24 18:56 ` Mike Snitzer
2025-09-25 2:56 ` Jonathan Flynn
0 siblings, 1 reply; 19+ messages in thread
From: Mike Snitzer @ 2025-09-24 18:56 UTC (permalink / raw)
To: NeilBrown, Chuck Lever
Cc: Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
Chuck Lever
On Wed, Sep 24, 2025 at 11:18:53AM -0400, Mike Snitzer wrote:
>
> FYI, I've been testing with NFSv3. So nfsd_read -> nfsd_iter_read
>
> The last time I tested NFSv4 was with my DIO READ code that checked
> the memory alignment with precision (albeit adding more work per IO,
> which Chuck would like to avoid).
We could just enable NFSD_IO_DIRECT for NFS v3 and older, and work
out adding support for NFSv4 as follow-on work for the next cycle?
This could be folded into Patch 3/3, and its header updated to be
clear that NFS v4+ isn't supported yet:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96792f8a8fc5a..459a29f377c2c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1182,10 +1182,6 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFSD_IO_BUFFERED:
break;
case NFSD_IO_DIRECT:
- if (nf->nf_dio_read_offset_align &&
- !rqstp->rq_res.page_len && !base)
- return nfsd_direct_read(rqstp, fhp, nf, offset,
- count, eof);
fallthrough;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
@@ -1622,8 +1618,14 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
file = nf->nf_file;
if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
- else
+ else {
+ /* Use vectored reads, for NFS v3 and older */
+ if (nfsd_io_cache_read == NFSD_IO_DIRECT &&
+ nf->nf_dio_read_offset_align &&
+ !rqstp->rq_res.page_len && !base)
+ return nfsd_direct_read(rqstp, fhp, nf, offset, count, eof);
err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);
+ }
nfsd_file_put(nf);
trace_nfsd_read_done(rqstp, fhp, offset, *count);
^ permalink raw reply related [flat|nested] 19+ messages in thread
* RE: [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ"]
2025-09-24 18:56 ` [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ"] Mike Snitzer
@ 2025-09-25 2:56 ` Jonathan Flynn
2025-09-25 13:13 ` Jonathan Flynn
0 siblings, 1 reply; 19+ messages in thread
From: Jonathan Flynn @ 2025-09-25 2:56 UTC (permalink / raw)
To: Mike Snitzer, NeilBrown, Chuck Lever
Cc: Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs@vger.kernel.org, Chuck Lever
> -----Original Message-----
> From: Mike Snitzer <snitzer@kernel.org>
> Sent: Wednesday, September 24, 2025 12:56 PM
> To: NeilBrown <neilb@ownmail.net>; Chuck Lever <cel@kernel.org>
> Cc: Jeff Layton <jlayton@kernel.org>; Olga Kornievskaia
> <okorniev@redhat.com>; Dai Ngo <dai.ngo@oracle.com>; Tom Talpey
> <tom@talpey.com>; linux-nfs@vger.kernel.org; Chuck Lever
> <chuck.lever@oracle.com>
> Subject: [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
> (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT
> for NFS READ"]
>
> On Wed, Sep 24, 2025 at 11:18:53AM -0400, Mike Snitzer wrote:
> >
> > FYI, I've been testing with NFSv3. So nfsd_read -> nfsd_iter_read
> >
> > The last time I tested NFSv4 was with my DIO READ code that checked
> > the memory alignment with precision (albeit adding more work per IO,
> > which Chuck would like to avoid).
>
> We could just enable NFSD_IO_DIRECT for NFS v3 and older. And work out
> adding support for NFSv4 as follow-on work for next cycle?
>
> This could be folded into Patch 3/3, and its header updated to be clear that
> NFS v4+ isn't supported yet:
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 96792f8a8fc5a..459a29f377c2c
> 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1182,10 +1182,6 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp,
> struct svc_fh *fhp,
> case NFSD_IO_BUFFERED:
> break;
> case NFSD_IO_DIRECT:
> - if (nf->nf_dio_read_offset_align &&
> - !rqstp->rq_res.page_len && !base)
> - return nfsd_direct_read(rqstp, fhp, nf, offset,
> - count, eof);
> fallthrough;
> case NFSD_IO_DONTCACHE:
> if (file->f_op->fop_flags & FOP_DONTCACHE) @@ -1622,8
> +1618,14 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> file = nf->nf_file;
> if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
> err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
> - else
> + else {
> + /* Use vectored reads, for NFS v3 and older */
> + if (nfsd_io_cache_read == NFSD_IO_DIRECT &&
> + nf->nf_dio_read_offset_align &&
> + !rqstp->rq_res.page_len && !base)
> + return nfsd_direct_read(rqstp, fhp, nf, offset, count,
> eof);
> err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);
> + }
>
> nfsd_file_put(nf);
> trace_nfsd_read_done(rqstp, fhp, offset, *count);
Hello,
I wanted to contribute some reproducible data to the discussion on
NFSD_IO_DIRECT. With a background in performance benchmarking, I ran fio-based
comparisons of buffered I/O (Mode0) and direct I/O (Mode2). The following
write-up includes the problem definition, configuration, procedure, and
results.
-------------------------------------------------------------------------------
Problem Definition
The primary motivation for NFSD_IO_DIRECT is to remove page cache contention
that reduces overall read and write performance. We have consistently seen
substantially better throughput with O_DIRECT than with buffered I/O, largely
due to avoiding contention in the page cache. While latency improvements are
not the sole goal, they serve as a clear indicator of the performance benefits.
Why the dataset is much larger than memory (and why 3.125x)
- Goal: ensure the server cannot satisfy requests from page cache, forcing
steady-state disk/network behavior rather than warm-cache artifacts.
- Sizing: dataset ~= 3.125x server RAM. With 1 TB RAM and 32 jobs, this
yields ~100 GB/job (total ~= 3.2 TB). The 3.125 factor is a convenience
that produces even 100 GB files; exact factor isn’t material as long as
dataset >> RAM.
- Effect: Buffered mode under memory pressure shows elevated p95-p99.9
latencies due to page-faults, LRU churn, writeback contention, and
reclaim activity. The NFSD_IO_DIRECT read path bypasses these cache
dynamics, which is why it reduces tail latency and increases sustained
bandwidth when the system is under load.
- Concurrency mapping: 32 jobs/files ensures 2 files per NVMe
namespace/share (2 * 16 = 32), balancing load across namespaces and
preventing any single namespace from becoming a bottleneck.
(This is primarily because my test environment is designed around
pNFS/NFSv4.2 for the metadata path but uses NFSv3 for the data path.)
Aside: Dataset fits in memory (~81% of RAM)
As a control case, we also ran tests where the dataset was only ~832 GB, or
~81.25% of the server’s 1 TB RAM. In this in-memory regime, buffered and
direct I/O produced nearly identical results. For reads, mean latency was
1.33 ms in both modes with negligible differences across percentiles. For
writes, mean latency was 1.76 ms in both modes with only fractional
variation well within measurement noise. Bandwidth was likewise ~97-98 GB/s
for reads and ~72-73 GB/s for writes, regardless of mode.
CPU utilization during this in-memory run further illustrates the dynamics:
- Writes Mode0 ~91% total (38% system, 50% iowait, 2.6% IRQ)
- Writes Mode2 ~79% total (9.4% system, 68.4% iowait, 1.8% IRQ)
- Reads Mode0 ~52% total (50% system, 0% iowait, 1.4% IRQ)
- Reads Mode2 ~72% total (11.5% system, 59% iowait, 1.9% IRQ)
Even though throughput and latency metrics converged, the CPU profiles
differ substantially between buffered and direct modes, reflecting where
the work is absorbed (system vs iowait). These results confirm that
NFSD_IO_DIRECT does not hurt performance when the dataset can be fully
cached; its advantages only emerge once datasets exceed memory capacity
and page cache contention becomes the limiting factor.
-------------------------------------------------------------------------------
Hardware / Software Configuration
- Data Server: Supermicro SYS-121C-TN10R
* CPU: 2x Intel Xeon Gold 6542Y (24 cores/socket, 2.8 GHz)
* Memory: 1 TB RAM
* Storage: 8x ScaleFlux 7.68TB CSD5000 NVMe SSDs
(each exposing 2 namespaces, 16 total)
* Each namespace formatted with XFS, exported via pNFS v4.2
- Metadata Server: Identical platform with 8x ScaleFlux 7.68TB CSD5000
NVMe SSDs, acting as a pNFS Metadata Server (Hammerspace Anvil)
- Client: Identical platform minus local NVMe
- Data Fabric: 2x ConnectX-7 NICs, 400 GbE with RDMA (RoCE)
- OS: Rocky Linux 9.5
- Kernel: 6.12 baseline with NFSD_IO_DIRECT patches applied
- Protocol under test: NFSv4.2 was used to mount all 16 namespaces under a
single mount point via pNFS (layouts/flexfiles). However, the underlying
storage traffic exercised NFSv3 shares, so effectively the test behavior
reflects NFSv3 with NFSv4.2 coordinating access.
-------------------------------------------------------------------------------
Test Procedure
The test methodology was automated using a wrapper script
(nfsd_mode_compare_fio.sh). Defaults were used without overrides.
The procedure was:
- Compute per-job file size so the aggregate dataset is ~3.125x the
memory on the NFS server (default: 1 TB → ~100G per job for 32 jobs).
- Build fio commands with:
* --numjobs=32, ensuring two files per NVMe namespace/share
(2 files × 16 namespaces = 32 total)
* --bs=1m, --iodepth=2, --ioengine=libaio
* --direct=1, --runtime=600, --ramp_time=30, --time_based
- Files created in ${MOUNT_POINT} using format testfile.$jobnum.$filenum
- Two threads per file: the second thread starts 50% into the file.
- Pass 1 (mode0): Set remote io_cache_read/io_cache_write to 0, run write
then read.
- Pause 60s between runs.
- Pass 2 (mode2): Set remote io_cache_read/io_cache_write to 2, drop remote
NFS server's page cache, rerun write and read.
- Results captured as JSON and summarized into bandwidth, latency
percentiles, and comparison tables.
-------------------------------------------------------------------------------
Results (Executive Summaries)
--- READ Executive Summary ---
Key                 mode0                mode2
------------------- -------------------- --------------------
Bandwidth (GB/s)    18.51 (+/-0.19)      97.86 (+/-0.01)
Mean Latency (ms)   7.21 (+/-15.30)      1.33 (+/-0.25)
Max Latency (ms)    3113.78              37.40
--- WRITE Executive Summary ---
Key                 mode0                mode2
------------------- -------------------- --------------------
Bandwidth (GB/s)    34.20 (+/-0.25)      72.70 (+/-0.01)
Mean Latency (ms)   3.84 (+/-6.54)       1.76 (+/-0.47)
Max Latency (ms)    199.07               25.68
CPU utilization during 3.125× dataset runs:
- Writes Mode0 ~83% total (79% system, 3.5% iowait, 0.98% IRQ)
- Writes Mode2 ~79% total (8.9% system, 68.7% iowait, 1.8% IRQ)
- Reads Mode0 ~99.5% total (97.1% system, 1.7% iowait, 0.76% IRQ)
- Reads Mode2 ~72% total (11.8% system, 58.1% iowait, 1.9% IRQ)
The improvement on reads is more than 5x in bandwidth with far lower tail
latency. Writes also show significant gains, indicating the direct path is
valuable for both workloads.
Here is a link to the full results as well as the scripts used to
generate this data, with a README describing how to reproduce the tests:
https://drive.google.com/drive/folders/1EbIAmigAv1KbE37dp2gRh_lCKIBxn2RM?usp=sharing
(I highly recommend reviewing the detailed results and the latency distributions)
These results support enabling NFSD_IO_DIRECT for NFSv3 now, as Mike
suggested. The benefits are clear and measurable. I agree that NFSv4 can
be addressed in a follow-on cycle once the alignment/validation work is
sorted out.
Happy to rerun with different parameters or workloads if that helps.
Thanks,
Jonathan Flynn
-------------------------------------------------------------------------------
Share file list:
3.125x_memory
grafana_screenshots
cpu_nfsv3_server.jpg
memory_nfsv3_server.jpg
nvme_nfsv3_server.jpg
logs
command_used.out
detailed_results.txt
fio_20250924_181954.log
fio_20250924_181954_read_mode0.json
fio_20250924_181954_read_mode2.json
fio_20250924_181954_write_mode0.json
fio_20250924_181954_write_mode2.json
what-if.out
0.8125x_memory
grafana_screenshots
cpu_nfsv3_server.jpg
memory_nfsv3_server.jpg
nvme_nfsv3_server.jpg
logs
command_used.out
detailed_results.txt
fio_20250924_230010.log
fio_20250924_230010_read_mode0.json
fio_20250924_230010_read_mode2.json
fio_20250924_230010_write_mode0.json
fio_20250924_230010_write_mode2.json
what-if.out
analyze_fio_json.py
nfsd_mode_compare_fio.sh
README
^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ"]
2025-09-25 2:56 ` Jonathan Flynn
@ 2025-09-25 13:13 ` Jonathan Flynn
0 siblings, 0 replies; 19+ messages in thread
From: Jonathan Flynn @ 2025-09-25 13:13 UTC (permalink / raw)
To: Mike Snitzer, NeilBrown, Chuck Lever
Cc: Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs@vger.kernel.org, Chuck Lever
> -----Original Message-----
> From: Jonathan Flynn
> Sent: Wednesday, September 24, 2025 8:56 PM
> To: 'Mike Snitzer' <snitzer@kernel.org>; NeilBrown <neilb@ownmail.net>;
> Chuck Lever <cel@kernel.org>
> Cc: Jeff Layton <jlayton@kernel.org>; Olga Kornievskaia
> <okorniev@redhat.com>; Dai Ngo <dai.ngo@oracle.com>; Tom Talpey
> <tom@talpey.com>; linux-nfs@vger.kernel.org; Chuck Lever
> <chuck.lever@oracle.com>
> Subject: RE: [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS
> READ (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement
> NFSD_IO_DIRECT for NFS READ"]
>
> > -----Original Message-----
> > From: Mike Snitzer <snitzer@kernel.org>
> > Sent: Wednesday, September 24, 2025 12:56 PM
> > To: NeilBrown <neilb@ownmail.net>; Chuck Lever <cel@kernel.org>
> > Cc: Jeff Layton <jlayton@kernel.org>; Olga Kornievskaia
> > <okorniev@redhat.com>; Dai Ngo <dai.ngo@oracle.com>; Tom Talpey
> > <tom@talpey.com>; linux-nfs@vger.kernel.org; Chuck Lever
> > <chuck.lever@oracle.com>
> > Subject: [RFC PATCH v3 4/3] NFSD: Implement NFSD_IO_DIRECT for NFS
> > READ
> > (v3 and older) [Was: "Re: [PATCH v3 3/3] NFSD: Implement
> > NFSD_IO_DIRECT for NFS READ"]
> >
> > On Wed, Sep 24, 2025 at 11:18:53AM -0400, Mike Snitzer wrote:
> > >
> > > FYI, I've been testing with NFSv3. So nfsd_read -> nfsd_iter_read
> > >
> > > The last time I tested NFSv4 was with my DIO READ code that checked
> > > the memory alignment with precision (albeit adding more work per IO,
> > > which Chuck would like to avoid).
> >
> > We could just enable NFSD_IO_DIRECT for NFS v3 and older. And work out
> > adding support for NFSv4 as follow-on work for next cycle?
> >
> > This could be folded into Patch 3/3, and its header updated to be
> > clear that NFS v4+ isn't supported yet:
> >
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index
> > 96792f8a8fc5a..459a29f377c2c
> > 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -1182,10 +1182,6 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp,
> > struct svc_fh *fhp,
> > case NFSD_IO_BUFFERED:
> > break;
> > case NFSD_IO_DIRECT:
> > - if (nf->nf_dio_read_offset_align &&
> > - !rqstp->rq_res.page_len && !base)
> > - return nfsd_direct_read(rqstp, fhp, nf, offset,
> > - count, eof);
> > fallthrough;
> > case NFSD_IO_DONTCACHE:
> > if (file->f_op->fop_flags & FOP_DONTCACHE) @@ -1622,8
> > +1618,14 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh
> > +*fhp,
> > file = nf->nf_file;
> > if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
> > err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
> > - else
> > + else {
> > + /* Use vectored reads, for NFS v3 and older */
> > + if (nfsd_io_cache_read == NFSD_IO_DIRECT &&
> > + nf->nf_dio_read_offset_align &&
> > + !rqstp->rq_res.page_len && !base)
> > + return nfsd_direct_read(rqstp, fhp, nf, offset, count,
> > eof);
> > err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);
> > + }
> >
> > nfsd_file_put(nf);
> > trace_nfsd_read_done(rqstp, fhp, offset, *count);
> Hello,
>
> I wanted to contribute some reproducible data to the discussion on
> NFSD_IO_DIRECT. With a background in performance benchmarking, I ran fio-
> based comparisons of buffered I/O (Mode0) and direct I/O (Mode2). The
> following write-up includes the problem definition, configuration, procedure,
> and results.
>
> -------------------------------------------------------------------------------
> Problem Definition
> The primary motivation for NFSD_IO_DIRECT is to remove page cache
> contention that reduces overall read and write performance. We have
> consistently seen substantially better throughput with O_DIRECT than with
> buffered I/O, largely due to avoiding contention in the page cache. While
> latency improvements are not the sole goal, they serve as a clear indicator of
> the performance benefits.
>
> Why dataset is much larger than memory (and why 3.125x)
> - Goal: ensure the server cannot satisfy requests from page cache, forcing
> steady-state disk/network behavior rather than warm-cache artifacts.
> - Sizing: dataset ~= 3.125x server RAM. With 1 TB RAM and 32 jobs, this
> yields ~100 GB/job (total ~= 3.2 TB). The 3.125 factor is a convenience
> that produces even 100 GB files; exact factor isn’t material as long as
> dataset >> RAM.
> - Effect: Buffered mode under memory pressure shows elevated p95-p99.9
> latencies due to page-faults, LRU churn, writeback contention, and
> reclaim activity. The NFSD_IO_DIRECT read path bypasses these cache
> dynamics, which is why it reduces tail latency and increases sustained
> bandwidth when the system is under load.
> - Concurrency mapping: 32 jobs/files ensures 2 files per NVMe
> namespace/share (2 * 16 = 32), balancing load across namespaces and
> preventing any single namespace from becoming a bottleneck.
> (This is primarily due to my test environment being designed around
> pNFS/NFSv4.2 for metadata path but uses NFSv3 for data path)
>
> Aside: Dataset fits in memory (~81% of RAM)
> As a control case, we also ran tests where the dataset was only ~832 GB, or
> ~81.25% of the server’s 1 TB RAM. In this in-memory regime, buffered and
> direct I/O produced nearly identical results. For reads, mean latency was
> 1.33 ms in both modes with negligible differences across percentiles. For
> writes, mean latency was 1.76 ms in both modes with only fractional
> variation well within measurement noise. Bandwidth was likewise ~97-98
> GB/s
> for reads and ~72-73 GB/s for writes, regardless of mode.
>
> CPU utilization during this in-memory run further illustrates the dynamics:
> - Writes Mode0 ~91% total (38% system, 50% iowait, 2.6% IRQ)
> - Writes Mode2 ~79% total (9.4% system, 68.4% iowait, 1.8% IRQ)
> - Reads Mode0 ~52% total (50% system, 0% iowait, 1.4% IRQ)
> - Reads Mode2 ~72% total (11.5% system, 59% iowait, 1.9% IRQ)
>
> Even though throughput and latency metrics converged, the CPU profiles
> differ substantially between buffered and direct modes, reflecting where
> the work is absorbed (system vs iowait). These results confirm that
> NFSD_IO_DIRECT does not hurt performance when the dataset can be fully
> cached; its advantages only emerge once datasets exceed memory capacity
> and page cache contention becomes the limiting factor.
>
> -------------------------------------------------------------------------------
> Hardware / Software Configuration
> - Data Server: Supermicro SYS-121C-TN10R
> * CPU: 2x Intel Xeon Gold 6542Y (24 cores/socket, 2.8 GHz)
> * Memory: 1 TB RAM
> * Storage: 8x ScaleFlux 7.68TB CSD5000 NVMe SSDs
> (each exposing 2 namespaces, 16 total)
> * Each namespace formatted with XFS, exported via pNFS v4.2
> - Metadata Server: Identical platform with 8x ScaleFlux 7.68TB CSD5000
> NVMe SSDs, acting as a pNFS Metadata Server (Hammerspace Anvil)
> - Client: Identical platform minus local NVMe
> - Data Fabric: 2x ConnectX-7 NICs, 400 GbE with RDMA (RoCE)
> - OS: Rocky Linux 9.5
> - Kernel: 6.12 baseline with NFSD_IO_DIRECT patches applied
> - Protocol under test: NFSv4.2 was used to mount all 16 namespaces under
> a
> single mount point via pNFS (layouts/flexfiles). However, the underlying
> storage traffic exercised NFSv3 shares, so effectively the test behavior
> reflects NFSv3 with NFSv4.2 coordinating access.
>
> -------------------------------------------------------------------------------
> Test Procedure
> The test methodology was automated using a wrapper script
> (nfsd_mode_compare_fio.sh). Defaults were used without overrides.
> The procedure was:
> - Compute per-job file size so the aggregate dataset is ~3.125x larger than
> the memory on the NFS server (default: 1 TB → ~100G per job for 32 jobs).
> - Build fio commands with:
> * --numjobs=32, ensuring two files per NVMe namespace/share
> (2 files × 16 namespaces = 32 total)
> * --bs=1m, --iodepth=2, --ioengine=libaio
> * --direct=1, --runtime=600, --ramp_time=30, --time_based
> - Files created in ${MOUNT_POINT} using format testfile.$jobnum.$filenum
> - Two threads per file: the second thread starts 50% into the file.
> - Pass 1 (mode0): Set remote io_cache_read/io_cache_write to 0, run write
> then read.
> - Pause 60s between runs.
> - Pass 2 (mode2): Set remote io_cache_read/io_cache_write to 2, drop
> remote
> NFS server's page cache, rerun write and read.
> - Results captured as JSON and summarized into bandwidth, latency
> percentiles, and comparison tables.
>
> -------------------------------------------------------------------------------
> Results (Executive Summaries)
>
> --- READ Executive Summary ---
> Key mode0 mode2
> ------------------- -------------------- --------------------
> Bandwidth (GB/s) 18.51 (+/-0.19) 97.86 (+/-0.01)
> Mean Latency (ms) 7.21 (+/-15.30) 1.33 (+/-0.25)
> Max Latency (ms) 3113.78 37.40
>
> --- WRITE Executive Summary ---
> Key mode0 mode2
> ------------------- -------------------- --------------------
> Bandwidth (GB/s) 34.20 (+/-0.25) 72.70 (+/-0.01)
> Mean Latency (ms) 3.84 (+/-6.54) 1.76 (+/-0.47)
> Max Latency (ms) 199.07 25.68
>
> CPU utilization during 3.125× dataset runs:
> - Writes Mode0 ~83% total (79% system, 3.5% iowait, 0.98% IRQ)
> - Writes Mode2 ~79% total (8.9% system, 68.7% iowait, 1.8% IRQ)
> - Reads Mode0 ~99.5% total (97.1% system, 1.7% iowait, 0.76% IRQ)
> - Reads Mode2 ~72% total (11.8% system, 58.1% iowait, 1.9% IRQ)
>
> The improvement on reads is more than 5x in bandwidth with far lower tail
> latency. Writes also show significant gains, indicating the direct path is
> valuable for both workloads.
>
> Here is a link to the full results as well as the scripts used to generate this
> data, with a README describing how to reproduce the tests:
> https://drive.google.com/drive/folders/1EbIAmigAv1KbE37dp2gRh_lCKIBxn2R
> M?usp=sharing
> (I highly recommend reviewing the detailed results and the latency
> distributions)
>
> These results support enabling NFSD_IO_DIRECT for NFSv3 now, as Mike
> suggested. The benefits are clear and measurable. I agree that NFSv4 can be
> addressed in a follow-on cycle once the alignment/validation work is sorted
> out.
>
> Happy to rerun with different parameters or workloads if that helps.
>
> Thanks,
> Jonathan Flynn
>
>
> -------------------------------------------------------------------------------
> Share file list:
> 3.125x_memory
> grafana_screenshots
> cpu_nfsv3_server.jpg
> memory_nfsv3_server.jpg
> nvme_nfsv3_server.jpg
> logs
> command_used.out
> detailed_results.txt
> fio_20250924_181954.log
> fio_20250924_181954_read_mode0.json
> fio_20250924_181954_read_mode2.json
> fio_20250924_181954_write_mode0.json
> fio_20250924_181954_write_mode2.json
> what-if.out
>
> 0.8125x_memory
> grafana_screenshots
> cpu_nfsv3_server.jpg
> memory_nfsv3_server.jpg
> nvme_nfsv3_server.jpg
> logs
> command_used.out
> detailed_results.txt
> fio_20250924_230010.log
> fio_20250924_230010_read_mode0.json
> fio_20250924_230010_read_mode2.json
> fio_20250924_230010_write_mode0.json
> fio_20250924_230010_write_mode2.json
> what-if.out
>
> analyze_fio_json.py
> nfsd_mode_compare_fio.sh
> README
Hello,
Following my earlier note, it became clear that one clarification is needed regarding how CPU utilization is represented in the results.
In the initial message, I reported total CPU utilization as the sum of system, iowait, and IRQ percentages. However, iowait is not an active consumer of CPU cycles; it represents time spent waiting on I/O completion. To better reflect the effective CPU resources consumed, the comparison below excludes iowait from the totals while still showing it for context:
- Writes Mode0 ~40.6% effective (38% system, 50% iowait, 2.6% IRQ)
- Writes Mode2 ~11.2% effective (9.4% system, 68.4% iowait, 1.8% IRQ)
- Reads Mode0 ~51.4% effective (50% system, 0% iowait, 1.4% IRQ)
- Reads Mode2 ~13.4% effective (11.5% system, 59% iowait, 1.9% IRQ)
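(Here "effective" is simply system + IRQ; for example, Mode0 writes are
38% system + 2.6% IRQ = 40.6% effective, with the 50% iowait listed only
for context.)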
This highlights that buffered I/O (Mode0) and direct I/O (Mode2) deliver essentially identical performance and latency when the dataset fits within memory, but NFSD_IO_DIRECT achieves this with far less active CPU utilization. The large share of time moved into iowait indicates the CPU is freed up from kernel bookkeeping and contention.
That freed CPU capacity could be leveraged for other data management tasks outside the direct data path: for example, compression, encryption, or security scanning jobs that operate on stored data. The point is not that these functions must be inline with reads/writes, but that reduced CPU load during I/O creates more headroom for additional services to run concurrently on the same infrastructure.
I hope this clarification is helpful for interpreting the results.
Thanks,
Jonathan Flynn
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-24 14:12 ` Mike Snitzer
2025-09-24 15:18 ` Mike Snitzer
@ 2025-09-26 0:18 ` NeilBrown
2025-09-26 1:30 ` Mike Snitzer
1 sibling, 1 reply; 19+ messages in thread
From: NeilBrown @ 2025-09-26 0:18 UTC (permalink / raw)
To: Mike Snitzer
Cc: Chuck Lever, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Thu, 25 Sep 2025, Mike Snitzer wrote:
> On Wed, Sep 24, 2025 at 05:30:38PM +1000, NeilBrown wrote:
> > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > On 9/23/25 4:48 PM, NeilBrown wrote:
> > > > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > >> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > > >>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> > > >>>> From: Chuck Lever <chuck.lever@oracle.com>
> > > >>>>
> > > >>>> Add an experimental option that forces NFS READ operations to use
> > > >>>> direct I/O instead of reading through the NFS server's page cache.
> > > >>>>
> > > >>>> There are already other layers of caching:
> > > >>>> - The page cache on NFS clients
> > > >>>> - The block device underlying the exported file system
> > > >>>>
> > > >>>> The server's page cache, in many cases, is unlikely to provide
> > > >>>> additional benefit. Some benchmarks have demonstrated that the
> > > >>>> server's page cache is actively detrimental for workloads whose
> > > >>>> working set is larger than the server's available physical memory.
> > > >>>>
> > > >>>> For instance, on small NFS servers, cached NFS file content can
> > > >>>> squeeze out local memory consumers. For large sequential workloads,
> > > >>>> an enormous amount of data flows into and out of the page cache
> > > >>>> and is consumed by NFS clients exactly once -- caching that data
> > > >>>> is expensive to do and totally valueless.
> > > >>>>
> > > >>>> For now this is a hidden option that can be enabled on test
> > > >>>> systems for benchmarking. In the longer term, this option might
> > > >>>> be enabled persistently or per-export. When the exported file
> > > >>>> system does not support direct I/O, NFSD falls back to using
> > > >>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> > > >>>>
> > > >>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> > > >>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> > > >>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > >>>> ---
> > > >>>> fs/nfsd/debugfs.c | 2 ++
> > > >>>> fs/nfsd/nfsd.h | 1 +
> > > >>>> fs/nfsd/trace.h | 1 +
> > > >>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> > > >>>> 4 files changed, 86 insertions(+)
> > > >>>>
> > > >>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > > >>>> index ed2b9e066206..00eb1ecef6ac 100644
> > > >>>> --- a/fs/nfsd/debugfs.c
> > > >>>> +++ b/fs/nfsd/debugfs.c
> > > >>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> > > >>>> * Contents:
> > > >>>> * %0: NFS READ will use buffered IO
> > > >>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> > > >>>> + * %2: NFS READ will use direct IO
> > > >>>> *
> > > >>>> * This setting takes immediate effect for all NFS versions,
> > > >>>> * all exports, and in all NFSD net namespaces.
> > > >>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> > > >>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> > > >>>> break;
> > > >>>> case NFSD_IO_DONTCACHE:
> > > >>>> + case NFSD_IO_DIRECT:
> > > >>>> /*
> > > >>>> * Must disable splice_read when enabling
> > > >>>> * NFSD_IO_DONTCACHE.
> > > >>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > > >>>> index ea87b42894dd..bdb60ee1f1a4 100644
> > > >>>> --- a/fs/nfsd/nfsd.h
> > > >>>> +++ b/fs/nfsd/nfsd.h
> > > >>>> @@ -157,6 +157,7 @@ enum {
> > > >>>> /* Any new NFSD_IO enum value must be added at the end */
> > > >>>> NFSD_IO_BUFFERED,
> > > >>>> NFSD_IO_DONTCACHE,
> > > >>>> + NFSD_IO_DIRECT,
> > > >>>> };
> > > >>>>
> > > >>>> extern u64 nfsd_io_cache_read __read_mostly;
> > > >>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > >>>> index 6e2c8e2aab10..bfd41236aff2 100644
> > > >>>> --- a/fs/nfsd/trace.h
> > > >>>> +++ b/fs/nfsd/trace.h
> > > >>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> > > >>>> DEFINE_NFSD_IO_EVENT(read_start);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_splice);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_vector);
> > > >>>> +DEFINE_NFSD_IO_EVENT(read_direct);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_io_done);
> > > >>>> DEFINE_NFSD_IO_EVENT(read_done);
> > > >>>> DEFINE_NFSD_IO_EVENT(write_start);
> > > >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > >>>> index 35880d3f1326..ddcd812f0761 100644
> > > >>>> --- a/fs/nfsd/vfs.c
> > > >>>> +++ b/fs/nfsd/vfs.c
> > > >>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> > > >>>> }
> > > >>>>
> > > >>>> +/*
> > > >>>> + * The byte range of the client's READ request is expanded on both
> > > >>>> + * ends until it meets the underlying file system's direct I/O
> > > >>>> + * alignment requirements. After the internal read is complete, the
> > > >>>> + * byte range of the NFS READ payload is reduced to the byte range
> > > >>>> + * that was originally requested.
> > > >>>> + *
> > > >>>> + * Note that a direct read can be done only when the xdr_buf
> > > >>>> + * containing the NFS READ reply does not already have contents in
> > > >>>> + * its .pages array. This is due to potentially restrictive
> > > >>>> + * alignment requirements on the read buffer. When .page_len and
> > > >>>> + * @base are zero, the .pages array is guaranteed to be page-
> > > >>>> + * aligned.
> > > >>>> + */
> > > >>>
> > > >>> So this ^ comment (and the related conversation with Neil in a
> > > >>> different thread) says page_len should be 0 on entry to
> > > >>> nfsd_direct_read.
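A minimal sketch of the expansion the quoted comment describes, assuming the
kernel's round_down()/round_up() helpers and that nf_dio_read_offset_align
governs both offset and length alignment (the patch's actual
nfsd_direct_read() may differ):

	/*
	 * Widen [offset, offset + *count) to the file system's direct
	 * I/O alignment, read the widened range, then trim the NFS
	 * READ payload back to the range the client asked for.
	 */
	loff_t dio_start = round_down(offset, nf->nf_dio_read_offset_align);
	loff_t dio_end = round_up(offset + *count, nf->nf_dio_read_offset_align);
	size_t dio_count = dio_end - dio_start;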
> > > >>>
> > > >>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >>>> switch (nfsd_io_cache_read) {
> > > >>>> case NFSD_IO_BUFFERED:
> > > >>>> break;
> > > >>>> + case NFSD_IO_DIRECT:
> > > >>>> + if (nf->nf_dio_read_offset_align &&
> > > >>>> + rqstp->rq_res.page_len && !base)
> > > >>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> > > >>>> + count, eof);
> > > >>>> + fallthrough;
> > > >>>
> > > >>> Yet nfsd_iter_read() only calls nfsd_direct_read() if
> > > >>> rqstp->rq_res.page_len is not zero; shouldn't it be
> > > >>> !rqstp->rq_res.page_len?
> > > >>
> > > >> Oops, yes. I did this work last week, while out of range of my lab.
> > > >>
> > > >>
> > > >>> (testing confirms it should be !rqstp->rq_res.page_len)
> > > >>>
> > > >>> Hopefully with this fix you can have more confidence in staging this
> > > >>> in your nfsd-testing?
> > > >> I'm waiting only for Neil to send an R-b.
> > > >
> > > > After noticing, like Mike, that the page_len test was inverted I went
> > > > looking to see where page_len was updated, to be sure that a second READ
> > > > request would not try to use DIRECT IO.
> > > > I can see that nfsd_splice_actor() updates page_len, but I cannot see
> > > > where it is updated when nfsd_iter_read() is used.
> > > >
> > > > What am I missing?
> > >
> > > It might be updated while the NFSv4 reply encoder is encoding a
> > > COMPOUND. If the size of the RPC reply so far is larger than the
> > > xdr_buf's .head, the xdr_stream will be positioned somewhere in the
> > > xdr_buf's .pages array.
> > >
> > > This is precisely why splice READ can be used only for the first
> > > NFSv4 READ operation in a COMPOUND. Subsequent READ operations
> > > must use nfsd_iter_read().
>
> Hi Neil,
>
> > Hmmmm...
> >
> > nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
> > the READ request. The maxcount is passed to xdr_reserve_space()
> > repeatedly (one page at a time) where it is added to xdr->buf->page_len
> > (where xdr is ->rq_res_stream and xdr->buf is rq_res).
> >
> > So the result is often that rq_res.page_len will be maxcount.
> >
> > Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
> > will test rq_res.page_len, which will always be non-zero.
> > So that isn't what we want.
>
> Not true. (And code-inspection-only review like this, which then makes
> incorrect yet strong assertions, is pretty disruptive.)
What am I disrupting exactly?
I'm (implicitly?) asked to review the patch. I do that by inspecting
the code. I'm not NAKing the code, I'm saying that I cannot see how it
could work. I'd be very happy for someone to explain how it does work.
We could then put that text in the commit message.
>
> That said, I did follow along with your code inspection and I see your
> concern (but you do gloss over some code in xdr_get_next_encode_buffer
> that would be cause for avoiding page_len += nbytes).
>
> I'll add some tracing to pin this down.
>
> > (after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
> > which will reduce ->page_len based on how much was read).
> >
> > Then nfsd4_encode_readv() calls nfsd_iter_read().
> >
> > I don't think we can use ->page_len to determine if it is safe to use
> > nfsd_direct_read(). The conditions where nfsd_direct_read() is safe are
> > similar to the conditions where nfsd_splice_read() is safe. Maybe we
> > should use similar code.
>
> Your takeaway from code inspection is clearly missing something
> because page_len is 0 on entry to nfsd_iter_read for both TCP and RDMA
> during my READ testing.
Great. I'm glad it works. But without understanding why it works, I
cannot give a Reviewed-By. Maybe it just works by accident (I've seen
that happen before). And if you or Chuck cannot explain why it works
then we clearly have work to do.
NeilBrown
>
> nfsd_direct_read() is being called only when io_cache_read is configured to
> use NFSD_IO_DIRECT. So !page_len && !base isn't too limiting; it works.
>
> Mike
>
^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-26 0:18 ` [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ NeilBrown
@ 2025-09-26 1:30 ` Mike Snitzer
0 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2025-09-26 1:30 UTC (permalink / raw)
To: NeilBrown
Cc: Chuck Lever, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On Fri, Sep 26, 2025 at 10:18:40AM +1000, NeilBrown wrote:
> On Thu, 25 Sep 2025, Mike Snitzer wrote:
> > On Wed, Sep 24, 2025 at 05:30:38PM +1000, NeilBrown wrote:
> > > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > > On 9/23/25 4:48 PM, NeilBrown wrote:
> > > > > On Wed, 24 Sep 2025, Chuck Lever wrote:
> > > > >> On 9/23/25 3:26 PM, Mike Snitzer wrote:
> > > > >>> On Mon, Sep 22, 2025 at 10:11:37AM -0400, Chuck Lever wrote:
> > > > >>>> From: Chuck Lever <chuck.lever@oracle.com>
> > > > >>>>
> > > > >>>> Add an experimental option that forces NFS READ operations to use
> > > > >>>> direct I/O instead of reading through the NFS server's page cache.
> > > > >>>>
> > > > >>>> There are already other layers of caching:
> > > > >>>> - The page cache on NFS clients
> > > > >>>> - The block device underlying the exported file system
> > > > >>>>
> > > > >>>> The server's page cache, in many cases, is unlikely to provide
> > > > >>>> additional benefit. Some benchmarks have demonstrated that the
> > > > >>>> server's page cache is actively detrimental for workloads whose
> > > > >>>> working set is larger than the server's available physical memory.
> > > > >>>>
> > > > >>>> For instance, on small NFS servers, cached NFS file content can
> > > > >>>> squeeze out local memory consumers. For large sequential workloads,
> > > > >>>> an enormous amount of data flows into and out of the page cache
> > > > >>>> and is consumed by NFS clients exactly once -- caching that data
> > > > >>>> is expensive to do and totally valueless.
> > > > >>>>
> > > > >>>> For now this is a hidden option that can be enabled on test
> > > > >>>> systems for benchmarking. In the longer term, this option might
> > > > >>>> be enabled persistently or per-export. When the exported file
> > > > >>>> system does not support direct I/O, NFSD falls back to using
> > > > >>>> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> > > > >>>>
> > > > >>>> Suggested-by: Mike Snitzer <snitzer@kernel.org>
> > > > >>>> Reviewed-by: Mike Snitzer <snitzer@kernel.org>
> > > > >>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > >>>> ---
> > > > >>>> fs/nfsd/debugfs.c | 2 ++
> > > > >>>> fs/nfsd/nfsd.h | 1 +
> > > > >>>> fs/nfsd/trace.h | 1 +
> > > > >>>> fs/nfsd/vfs.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
> > > > >>>> 4 files changed, 86 insertions(+)
> > > > >>>>
> > > > >>>> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > > > >>>> index ed2b9e066206..00eb1ecef6ac 100644
> > > > >>>> --- a/fs/nfsd/debugfs.c
> > > > >>>> +++ b/fs/nfsd/debugfs.c
> > > > >>>> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> > > > >>>> * Contents:
> > > > >>>> * %0: NFS READ will use buffered IO
> > > > >>>> * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> > > > >>>> + * %2: NFS READ will use direct IO
> > > > >>>> *
> > > > >>>> * This setting takes immediate effect for all NFS versions,
> > > > >>>> * all exports, and in all NFSD net namespaces.
> > > > >>>> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> > > > >>>> nfsd_io_cache_read = NFSD_IO_BUFFERED;
> > > > >>>> break;
> > > > >>>> case NFSD_IO_DONTCACHE:
> > > > >>>> + case NFSD_IO_DIRECT:
> > > > >>>> /*
> > > > >>>> * Must disable splice_read when enabling
> > > > >>>> * NFSD_IO_DONTCACHE.
> > > > >>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > > > >>>> index ea87b42894dd..bdb60ee1f1a4 100644
> > > > >>>> --- a/fs/nfsd/nfsd.h
> > > > >>>> +++ b/fs/nfsd/nfsd.h
> > > > >>>> @@ -157,6 +157,7 @@ enum {
> > > > >>>> /* Any new NFSD_IO enum value must be added at the end */
> > > > >>>> NFSD_IO_BUFFERED,
> > > > >>>> NFSD_IO_DONTCACHE,
> > > > >>>> + NFSD_IO_DIRECT,
> > > > >>>> };
> > > > >>>>
> > > > >>>> extern u64 nfsd_io_cache_read __read_mostly;
> > > > >>>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > >>>> index 6e2c8e2aab10..bfd41236aff2 100644
> > > > >>>> --- a/fs/nfsd/trace.h
> > > > >>>> +++ b/fs/nfsd/trace.h
> > > > >>>> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
> > > > >>>> DEFINE_NFSD_IO_EVENT(read_start);
> > > > >>>> DEFINE_NFSD_IO_EVENT(read_splice);
> > > > >>>> DEFINE_NFSD_IO_EVENT(read_vector);
> > > > >>>> +DEFINE_NFSD_IO_EVENT(read_direct);
> > > > >>>> DEFINE_NFSD_IO_EVENT(read_io_done);
> > > > >>>> DEFINE_NFSD_IO_EVENT(read_done);
> > > > >>>> DEFINE_NFSD_IO_EVENT(write_start);
> > > > >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > >>>> index 35880d3f1326..ddcd812f0761 100644
> > > > >>>> --- a/fs/nfsd/vfs.c
> > > > >>>> +++ b/fs/nfsd/vfs.c
> > > > >>>> @@ -1074,6 +1074,82 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > >>>> return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> > > > >>>> }
> > > > >>>>
> > > > >>>> +/*
> > > > >>>> + * The byte range of the client's READ request is expanded on both
> > > > >>>> + * ends until it meets the underlying file system's direct I/O
> > > > >>>> + * alignment requirements. After the internal read is complete, the
> > > > >>>> + * byte range of the NFS READ payload is reduced to the byte range
> > > > >>>> + * that was originally requested.
> > > > >>>> + *
> > > > >>>> + * Note that a direct read can be done only when the xdr_buf
> > > > >>>> + * containing the NFS READ reply does not already have contents in
> > > > >>>> + * its .pages array. This is due to potentially restrictive
> > > > >>>> + * alignment requirements on the read buffer. When .page_len and
> > > > >>>> + * @base are zero, the .pages array is guaranteed to be page-
> > > > >>>> + * aligned.
> > > > >>>> + */
> > > > >>>
> > > > >>> So this ^ comment (and the related conversation with Neil in a
> > > > >>> different thread) says page_len should be 0 on entry to
> > > > >>> nfsd_direct_read.
> > > > >>>
> > > > >>>> @@ -1106,6 +1182,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > > >>>> switch (nfsd_io_cache_read) {
> > > > >>>> case NFSD_IO_BUFFERED:
> > > > >>>> break;
> > > > >>>> + case NFSD_IO_DIRECT:
> > > > >>>> + if (nf->nf_dio_read_offset_align &&
> > > > >>>> + rqstp->rq_res.page_len && !base)
> > > > >>>> + return nfsd_direct_read(rqstp, fhp, nf, offset,
> > > > >>>> + count, eof);
> > > > >>>> + fallthrough;
> > > > >>>
> > > > >>> Yet nfsd_iter_read() only calls nfsd_direct_read() if
> > > > >>> rqstp->rq_res.page_len is not zero; shouldn't it be
> > > > >>> !rqstp->rq_res.page_len?
> > > > >>
> > > > >> Oops, yes. I did this work last week, while out of range of my lab.
> > > > >>
> > > > >>
> > > > >>> (testing confirms it should be !rqstp->rq_res.page_len)
> > > > >>>
> > > > >>> Hopefully with this fix you can have more confidence in staging this
> > > > >>> in your nfsd-testing?
> > > > >> I'm waiting only for Neil to send an R-b.
> > > > >
> > > > > After noticing, like Mike, that the page_len test was inverted I went
> > > > > looking to see where page_len was updated, to be sure that a second READ
> > > > > request would not try to use DIRECT IO.
> > > > > I can see that nfsd_splice_actor() updates page_len, but I cannot see
> > > > > where it is updated when nfsd_iter_read() is used.
> > > > >
> > > > > What am I missing?
> > > >
> > > > It might be updated while the NFSv4 reply encoder is encoding a
> > > > COMPOUND. If the size of the RPC reply so far is larger than the
> > > > xdr_buf's .head, the xdr_stream will be positioned somewhere in the
> > > > xdr_buf's .pages array.
> > > >
> > > > This is precisely why splice READ can be used only for the first
> > > > NFSv4 READ operation in a COMPOUND. Subsequent READ operations
> > > > must use nfsd_iter_read().
> >
> > Hi Neil,
> >
> > > Hmmmm...
> > >
> > > nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
> > > the READ request. The maxcount is passed to xdr_reserve_space()
> > > repeatedly (one page at a time) where it is added to xdr->buf->page_len
> > > (where xdr is ->rq_res_stream and xdr->buf is rq_res).
> > >
> > > So the result is often that rq_res.page_len will be maxcount.
> > >
> > > Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
> > > will test rq_res.page_len, which will always be non-zero.
> > > So that isn't what we want.
> >
> > Not true. (And code-inspection-only review like this, which then makes
> > incorrect yet strong assertions, is pretty disruptive.)
>
> What am I disrupting exactly?
>
> I'm (implicitly?) asked to review the patch. I do that by inspecting
> the code. I'm not NAKing the code, I'm saying that I cannot see how it
> could work. I'd be very happy for someone to explain how it does work.
> We could then put that text in the commit message.
Your review found that verifying page_len is 0 is inadequate for NFSv4+.
I had been testing with NFSv3.
Your review is always helpful! (I held it to be less than that in the
moment because I was sure it had to be missing something... that something
was the v3 vs v4 usage. We both had partial info that left us each confused.)
Anyway, apologies.
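A minimal sketch of a guard that matches what holds today, assuming
rqstp->rq_vers carries the NFS version on this path (an assumption for
illustration only, not code from the patch):

	case NFSD_IO_DIRECT:
		/*
		 * Sketch: limit direct reads to NFSv3, where
		 * nfsd_iter_read() runs before the reply encoder has
		 * written into rq_res.pages, so the page_len == 0 test
		 * is meaningful. rq_vers is assumed here purely for
		 * illustration.
		 */
		if (rqstp->rq_vers == 3 && nf->nf_dio_read_offset_align &&
		    !rqstp->rq_res.page_len && !base)
			return nfsd_direct_read(rqstp, fhp, nf, offset,
						count, eof);
		fallthrough;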
> > That said, I did follow along with your code inspection and I see your
> > concern (but you do gloss over some code in xdr_get_next_encode_buffer
> > that would be cause for avoiding page_len += nbytes).
> >
> > I'll add some tracing to pin this down.
> >
> > > (after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
> > > which will reduce ->page_len based on how much was read).
> > >
> > > Then nfsd4_encode_readv() calls nfsd_iter_read().
> > >
> > > I don't think we can use ->page_len to determine if it is safe to use
> > > nfsd_direct_read(). The conditions where nfsd_direct_read() is safe are
> > > similar to the conditions where nfsd_splice_read() is safe. Maybe we
> > > should use similar code.
> >
> > Your takeaway from code inspection is clearly missing something
> > because page_len is 0 on entry to nfsd_iter_read for both TCP and RDMA
> > during my READ testing.
>
> Great. I'm glad it works. But without understanding why it works, I
> cannot give a Reviewed-By. Maybe it just works by accident (I've seen
> that happen before). And if you or Chuck cannot explain why it works
> then we clearly have work to do.
My follow-up replies clarified that it worked because I was using NFSv3:
https://lore.kernel.org/linux-nfs/aNQ-1OSG9Ti5XbUm@kernel.org/
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ
2025-09-24 7:30 ` NeilBrown
2025-09-24 14:12 ` Mike Snitzer
@ 2025-09-24 18:56 ` Chuck Lever
1 sibling, 0 replies; 19+ messages in thread
From: Chuck Lever @ 2025-09-24 18:56 UTC (permalink / raw)
To: NeilBrown
Cc: Mike Snitzer, Jeff Layton, Olga Kornievskaia, Dai Ngo, Tom Talpey,
linux-nfs, Chuck Lever
On 9/24/25 12:30 AM, NeilBrown wrote:
> nfsd4_encode_readv() calls xdr_reserve_space_vec() passing maxcount from
> the READ request. The maxcount is passed to xdr_reserve_space()
> repeatedly (one page at a time) where it is added to xdr->buf->page_len
> (where xdr is ->rq_res_stream and xdr->buf is rq_res).
>
> So the result is often that rq_res.page_len will be maxcount.
>
> Then nfsd4_encode_readv() calls nfsd_iter_read() which, with this patch,
> will test rq_res.page_len, which will always be non-zero.
> So that isn't what we want.
>
> (after the read, nfsd4_encode_readv() will call xdr_truncate_encode()
> which will reduce ->page_len based on how much was read).
>
> Then nfsd4_encode_readv() calls nfsd_iter_read().
>
> I don't think we can use ->page_len to determine if it is safe to use
> nfsd_direct_read().
Agreed.
> The conditions where nfsd_direct_read() is safe are
> similar to the conditions where nfsd_splice_read() is safe. Maybe we
> should use similar code.
That seems to be this bit of logic at the tail of
nfsd4_decode_compound()?

	argp->splice_ok = nfsd_read_splice_ok(argp->rqstp);
	if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
		argp->splice_ok = false;

But it is rather specific to NFSv4 COMPOUND processing.
There is this familiar-looking check in nfsd4_encode_splice_read():

	/*
	 * Splice read doesn't work if encoding has already wandered
	 * into the XDR buf's page array.
	 */
	if (unlikely(xdr->buf->page_len)) {
		WARN_ON_ONCE(1);
		return nfserr_serverfault;
	}
That check probably works fine there, since the splice path has no
xdr_reserve_space_vec() call, and it might even be unnecessary.
I think nfsd4_encode_readv() needs to perform the check somehow.
The check for NFSv3 is always going to be simpler.
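A purely illustrative sketch of that direction, with an explicit go/no-go
flag decided by protocol-specific code, analogous to splice_ok; the
rq_direct_read_ok field is hypothetical and does not exist in this series:

	case NFSD_IO_DIRECT:
		/*
		 * rq_direct_read_ok is hypothetical: it would be set
		 * only by protocol code that knows rq_res.pages has
		 * not been written yet, much as splice_ok is computed
		 * at the tail of nfsd4_decode_compound().
		 */
		if (nf->nf_dio_read_offset_align &&
		    rqstp->rq_direct_read_ok && !base)
			return nfsd_direct_read(rqstp, fhp, nf, offset,
						count, eof);
		fallthrough;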
--
Chuck Lever
^ permalink raw reply [flat|nested] 19+ messages in thread