* [PATCH v3 1/5] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
@ 2025-07-14 22:42 ` Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 2/5] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
` (5 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Mike Snitzer @ 2025-07-14 22:42 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton; +Cc: linux-nfs
Use STATX_DIOALIGN and STATX_DIO_READ_ALIGN to get and store the DIO
alignment attributes from the underlying filesystem in the associated
nfsd_file. This is done when the nfsd_file is first opened for
a regular file.
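For reference (not part of this patch), a minimal userspace sketch of how
these attributes surface via statx(2); it assumes headers new enough to
define STATX_DIO_READ_ALIGN:
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	struct statx stx;

	/* Request the same attribute masks nfsd caches per nfsd_file */
	if (statx(AT_FDCWD, path, 0,
		  STATX_DIOALIGN | STATX_DIO_READ_ALIGN, &stx) < 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio_mem_align=%u dio_offset_align=%u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	if (stx.stx_mask & STATX_DIO_READ_ALIGN)
		printf("dio_read_offset_align=%u\n",
		       stx.stx_dio_read_offset_align);
	return 0;
}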
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/filecache.c | 32 ++++++++++++++++++++++++++++++++
fs/nfsd/filecache.h | 4 ++++
fs/nfsd/nfsfh.c | 4 ++++
3 files changed, 40 insertions(+)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 68b8d0c6414e..a42cfc23435a 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
refcount_set(&nf->nf_ref, 1);
nf->nf_may = need;
nf->nf_mark = NULL;
+ nf->nf_dio_mem_align = 0;
+ nf->nf_dio_offset_align = 0;
+ nf->nf_dio_read_offset_align = 0;
return nf;
}
@@ -1069,6 +1072,33 @@ nfsd_file_is_cached(struct inode *inode)
return ret;
}
+static __be32
+nfsd_file_getattr(const struct svc_fh *fhp, struct nfsd_file *nf)
+{
+ struct inode *inode = file_inode(nf->nf_file);
+ struct kstat stat;
+ __be32 status;
+
+ /* Currently only need to get DIO alignment info for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return nfs_ok;
+
+ status = fh_getattr(fhp, &stat);
+ if (status != nfs_ok)
+ return status;
+
+ if (stat.result_mask & STATX_DIOALIGN) {
+ nf->nf_dio_mem_align = stat.dio_mem_align;
+ nf->nf_dio_offset_align = stat.dio_offset_align;
+ }
+ if (stat.result_mask & STATX_DIO_READ_ALIGN)
+ nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
+ else
+ nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
+
+ return status;
+}
+
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
struct svc_cred *cred,
@@ -1187,6 +1217,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
}
status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
+ if (status == nfs_ok)
+ status = nfsd_file_getattr(fhp, nf);
}
} else
status = nfserr_jukebox;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 722b26c71e45..237a05c74211 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -54,6 +54,10 @@ struct nfsd_file {
struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
+
+ u32 nf_dio_mem_align;
+ u32 nf_dio_offset_align;
+ u32 nf_dio_read_offset_align;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f4a3cc9e31e0..bdba2ba828a6 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -677,8 +677,12 @@ __be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat)
.mnt = fhp->fh_export->ex_path.mnt,
.dentry = fhp->fh_dentry,
};
+ struct inode *inode = d_inode(p.dentry);
u32 request_mask = STATX_BASIC_STATS;
+ if (S_ISREG(inode->i_mode))
+ request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN);
+
if (fhp->fh_maxsize == NFS4_FHSIZE)
request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
--
2.44.0
* [PATCH v3 2/5] NFSD: pass nfsd_file to nfsd_iter_read()
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 1/5] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
@ 2025-07-14 22:42 ` Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 3/5] NFSD: add io_cache_read controls to debugfs interface Mike Snitzer
` (4 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Mike Snitzer @ 2025-07-14 22:42 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton; +Cc: linux-nfs
Prepare for nfsd_iter_read() to use the DIO alignment information stored
in the nfsd_file.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/nfs4xdr.c | 8 ++++----
fs/nfsd/vfs.c | 7 ++++---
fs/nfsd/vfs.h | 2 +-
3 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2ee218ff4958..4e29297c610a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4476,7 +4476,7 @@ static __be32 nfsd4_encode_splice_read(
static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
unsigned int base = xdr->buf->page_len & ~PAGE_MASK;
@@ -4487,7 +4487,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
if (xdr_reserve_space_vec(xdr, maxcount) < 0)
return nfserr_resource;
- nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
+ nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, read->rd_nf,
read->rd_offset, &maxcount, base,
&read->rd_eof);
read->rd_length = maxcount;
@@ -4534,7 +4534,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
- nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ nfserr = nfsd4_encode_readv(resp, read, maxcount);
if (nfserr) {
xdr_truncate_encode(xdr, eof_offset);
return nfserr;
@@ -5430,7 +5430,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
- nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ nfserr = nfsd4_encode_readv(resp, read, maxcount);
if (nfserr)
return nfserr;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2edf76feaeb9..845c212ad10b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1067,7 +1067,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
- * @file: opened struct file of file to be read
+ * @nf: opened struct nfsd_file of file to be read
* @offset: starting byte offset
* @count: IN: requested number of bytes; OUT: number of bytes read
* @base: offset in first page of read buffer
@@ -1080,9 +1080,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
* returned.
*/
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset, unsigned long *count,
+ struct nfsd_file *nf, loff_t offset, unsigned long *count,
unsigned int base, u32 *eof)
{
+ struct file *file = nf->nf_file;
unsigned long v, total;
struct iov_iter iter;
struct kiocb kiocb;
@@ -1304,7 +1305,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
- err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
+ err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof);
nfsd_file_put(nf);
trace_nfsd_read_done(rqstp, fhp, offset, *count);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 0c0292611c6d..fa46f8b5f132 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -121,7 +121,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned long *count,
u32 *eof);
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset,
+ struct nfsd_file *nf, loff_t offset,
unsigned long *count, unsigned int base,
u32 *eof);
bool nfsd_read_splice_ok(struct svc_rqst *rqstp);
--
2.44.0
* [PATCH v3 3/5] NFSD: add io_cache_read controls to debugfs interface
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 1/5] NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 2/5] NFSD: pass nfsd_file to nfsd_iter_read() Mike Snitzer
@ 2025-07-14 22:42 ` Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 4/5] NFSD: add io_cache_write " Mike Snitzer
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Mike Snitzer @ 2025-07-14 22:42 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton; +Cc: linux-nfs
Add 'io_cache_read' to NFSD's debugfs interface so that any data
read by NFSD will be one of:
- cached using the page cache (NFSD_IO_BUFFERED=1)
- cached but removed from the page cache upon completion
(NFSD_IO_DONTCACHE=2)
- not cached (NFSD_IO_DIRECT=3)
io_cache_read may be set by writing to:
/sys/kernel/debug/nfsd/io_cache_read
If NFSD_IO_DONTCACHE is specified using 2, FOP_DONTCACHE must be
advertised as supported by the underlying filesystem (e.g. XFS);
otherwise all IO flagged with RWF_DONTCACHE will fail with
-EOPNOTSUPP.
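As a quick userspace probe (a sketch; the fallback RWF_DONTCACHE value
below is an assumption for older headers), one can check whether the
underlying filesystem accepts dontcache IO:
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080	/* assumed; needs Linux 6.14+ */
#endif

int main(int argc, char **argv)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	/* fails with EOPNOTSUPP if the filesystem lacks FOP_DONTCACHE */
	if (fd < 0 || preadv2(fd, &iov, 1, 0, RWF_DONTCACHE) < 0) {
		fprintf(stderr, "dontcache probe: %s\n", strerror(errno));
		return 1;
	}
	printf("dontcache read OK\n");
	return 0;
}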
If NFSD_IO_DIRECT is specified using 3, the IO must be aligned to the
underlying block device's logical_block_size, and the memory buffer
used to store the read must be aligned to the underlying block
device's dma_alignment.
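Illustratively, both constraints reduce to power-of-two mask tests. A
hypothetical helper (a sketch, not code from this series):
#include <linux/types.h>

static inline bool nfsd_dio_aligned(loff_t offset, size_t len,
				    unsigned long buf_addr,
				    u32 mem_align, u32 offset_align)
{
	/* offset and length must be multiples of logical_block_size */
	if ((offset | len) & (offset_align - 1))
		return false;
	/* the payload buffer must satisfy the device's dma_alignment */
	return (buf_addr & (mem_align - 1)) == 0;
}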
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/debugfs.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/nfsd.h | 9 ++++++++
fs/nfsd/vfs.c | 16 +++++++++++++
3 files changed, 83 insertions(+)
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 84b0c8b559dc..ad67ccba01ec 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -27,11 +27,66 @@ static int nfsd_dsr_get(void *data, u64 *val)
static int nfsd_dsr_set(void *data, u64 val)
{
nfsd_disable_splice_read = (val > 0) ? true : false;
+ if (!nfsd_disable_splice_read) {
+ /*
+ * Cannot use NFSD_IO_DONTCACHE or NFSD_IO_DIRECT
+ * if splice_read is enabled.
+ */
+ nfsd_io_cache_read = NFSD_IO_BUFFERED;
+ }
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
+/*
+ * /sys/kernel/debug/nfsd/io_cache_read
+ *
+ * Contents:
+ * %1: NFS READ will use buffered IO
+ * %2: NFS READ will use dontcache (buffered IO w/ dropbehind)
+ * %3: NFS READ will use direct IO
+ *
+ * The default value of this setting is zero (UNSPECIFIED).
+ * This setting takes immediate effect for all NFS versions,
+ * all exports, and in all NFSD net namespaces.
+ */
+
+static int nfsd_io_cache_read_get(void *data, u64 *val)
+{
+ *val = nfsd_io_cache_read;
+ return 0;
+}
+
+static int nfsd_io_cache_read_set(void *data, u64 val)
+{
+ int ret = 0;
+
+ switch (val) {
+ case NFSD_IO_BUFFERED:
+ nfsd_io_cache_read = NFSD_IO_BUFFERED;
+ break;
+ case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
+ /*
+ * Must disable splice_read when enabling
+ * NFSD_IO_DONTCACHE or NFSD_IO_DIRECT.
+ */
+ nfsd_disable_splice_read = true;
+ nfsd_io_cache_read = val;
+ break;
+ default:
+ nfsd_io_cache_read = NFSD_IO_UNSPECIFIED;
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get,
+ nfsd_io_cache_read_set, "%llu\n");
+
void nfsd_debugfs_exit(void)
{
debugfs_remove_recursive(nfsd_top_dir);
@@ -44,4 +99,7 @@ void nfsd_debugfs_init(void)
debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
nfsd_top_dir, NULL, &nfsd_dsr_fops);
+
+ debugfs_create_file("io_cache_read", S_IWUSR | S_IRUGO,
+ nfsd_top_dir, NULL, &nfsd_io_cache_read_fops);
}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 1cd0bed57bc2..6ef799405145 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -153,6 +153,15 @@ static inline void nfsd_debugfs_exit(void) {}
extern bool nfsd_disable_splice_read __read_mostly;
+enum {
+ NFSD_IO_UNSPECIFIED = 0,
+ NFSD_IO_BUFFERED,
+ NFSD_IO_DONTCACHE,
+ NFSD_IO_DIRECT,
+};
+
+extern u64 nfsd_io_cache_read __read_mostly;
+
extern int nfsd_max_blksize;
static inline int nfsd_v4client(struct svc_rqst *rq)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 845c212ad10b..2fb8bac358e6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -49,6 +49,7 @@
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
bool nfsd_disable_splice_read __read_mostly;
+u64 nfsd_io_cache_read __read_mostly;
/**
* nfserrno - Map Linux errnos to NFS errnos
@@ -1107,6 +1108,21 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
+
+ switch (nfsd_io_cache_read) {
+ case NFSD_IO_DIRECT:
+ if (nf->nf_dio_mem_align && nf->nf_dio_read_offset_align &&
+ iov_iter_is_aligned(&iter, nf->nf_dio_mem_align - 1,
+ nf->nf_dio_read_offset_align - 1))
+ kiocb.ki_flags = IOCB_DIRECT;
+ break;
+ case NFSD_IO_DONTCACHE:
+ kiocb.ki_flags = IOCB_DONTCACHE;
+ break;
+ case NFSD_IO_BUFFERED:
+ break;
+ }
+
host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
--
2.44.0
* [PATCH v3 4/5] NFSD: add io_cache_write controls to debugfs interface
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
` (2 preceding siblings ...)
2025-07-14 22:42 ` [PATCH v3 3/5] NFSD: add io_cache_read controls to debugfs interface Mike Snitzer
@ 2025-07-14 22:42 ` Mike Snitzer
2025-07-14 22:42 ` [PATCH v3 5/5] NFSD: issue READs using O_DIRECT even if IO is misaligned Mike Snitzer
` (2 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Mike Snitzer @ 2025-07-14 22:42 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton; +Cc: linux-nfs
Add 'io_cache_write' to NFSD's debugfs interface so that any data
written by NFSD will be one of:
- cached using the page cache (NFSD_IO_BUFFERED=1)
- cached but removed from the page cache upon completion
(NFSD_IO_DONTCACHE=2)
- not cached (NFSD_IO_DIRECT=3)
io_cache_write may be set by writing to:
/sys/kernel/debug/nfsd/io_cache_write
If NFSD_IO_DONTCACHE is specified using 2, FOP_DONTCACHE must be
advertised as supported by the underlying filesystem (e.g. XFS);
otherwise all IO flagged with RWF_DONTCACHE will fail with
-EOPNOTSUPP.
If NFSD_IO_DIRECT is specified using 3, the IO must be aligned to the
underlying block device's logical_block_size, and the memory buffer
used to store the WRITE payload must be aligned to the underlying
block device's dma_alignment.
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/debugfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/nfsd.h | 1 +
fs/nfsd/vfs.c | 18 ++++++++++++++++++
3 files changed, 63 insertions(+)
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index ad67ccba01ec..b8120b0397d7 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -87,6 +87,47 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get,
nfsd_io_cache_read_set, "%llu\n");
+/*
+ * /sys/kernel/debug/nfsd/io_cache_write
+ *
+ * Contents:
+ * %1: NFS WRITE will use buffered IO
+ * %2: NFS WRITE will use dontcache (buffered IO w/ dropbehind)
+ * %3: NFS WRITE will use direct IO
+ *
+ * The default value of this setting is zero (UNSPECIFIED).
+ * This setting takes immediate effect for all NFS versions,
+ * all exports, and in all NFSD net namespaces.
+ */
+
+static int nfsd_io_cache_write_get(void *data, u64 *val)
+{
+ *val = nfsd_io_cache_write;
+ return 0;
+}
+
+static int nfsd_io_cache_write_set(void *data, u64 val)
+{
+ int ret = 0;
+
+ switch (val) {
+ case NFSD_IO_BUFFERED:
+ case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
+ nfsd_io_cache_write = val;
+ break;
+ default:
+ nfsd_io_cache_write = NFSD_IO_UNSPECIFIED;
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_write_fops, nfsd_io_cache_write_get,
+ nfsd_io_cache_write_set, "%llu\n");
+
void nfsd_debugfs_exit(void)
{
debugfs_remove_recursive(nfsd_top_dir);
@@ -102,4 +143,7 @@ void nfsd_debugfs_init(void)
debugfs_create_file("io_cache_read", S_IWUSR | S_IRUGO,
nfsd_top_dir, NULL, &nfsd_io_cache_read_fops);
+
+ debugfs_create_file("io_cache_write", S_IWUSR | S_IRUGO,
+ nfsd_top_dir, NULL, &nfsd_io_cache_write_fops);
}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6ef799405145..fe935b4cda53 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -161,6 +161,7 @@ enum {
};
extern u64 nfsd_io_cache_read __read_mostly;
+extern u64 nfsd_io_cache_write __read_mostly;
extern int nfsd_max_blksize;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2fb8bac358e6..dfeaed3d2d41 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -50,6 +50,7 @@
bool nfsd_disable_splice_read __read_mostly;
u64 nfsd_io_cache_read __read_mostly;
+u64 nfsd_io_cache_write __read_mostly;
/**
* nfserrno - Map Linux errnos to NFS errnos
@@ -1229,6 +1230,23 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+
+ switch (nfsd_io_cache_write) {
+ case NFSD_IO_DIRECT:
+ /* direct I/O must be aligned to device logical sector size */
+ if (nf->nf_dio_mem_align && nf->nf_dio_offset_align &&
+ (((offset | *cnt) & (nf->nf_dio_offset_align - 1)) == 0) &&
+ iov_iter_is_aligned(&iter, nf->nf_dio_mem_align - 1,
+ nf->nf_dio_offset_align - 1))
+ kiocb.ki_flags = IOCB_DIRECT;
+ break;
+ case NFSD_IO_DONTCACHE:
+ kiocb.ki_flags = IOCB_DONTCACHE;
+ break;
+ case NFSD_IO_BUFFERED:
+ break;
+ }
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
--
2.44.0
* [PATCH v3 5/5] NFSD: issue READs using O_DIRECT even if IO is misaligned
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
` (3 preceding siblings ...)
2025-07-14 22:42 ` [PATCH v3 4/5] NFSD: add io_cache_write " Mike Snitzer
@ 2025-07-14 22:42 ` Mike Snitzer
2025-07-15 9:24 ` [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Daire Byrne
2025-07-15 13:59 ` Chuck Lever
6 siblings, 0 replies; 13+ messages in thread
From: Mike Snitzer @ 2025-07-14 22:42 UTC (permalink / raw)
To: Chuck Lever, Jeff Layton; +Cc: linux-nfs
If NFSD_IO_DIRECT is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ). The expanded READ is
verified to have a proper offset/len (logical_block_size) and to pass
dma_alignment checking.
A bounce-buffer page (called 'start_extra_page') must be allocated and
used if/when expanding the misaligned READ requires reading an extra
partial page at the start of the READ so that it is DIO-aligned.
Otherwise that extra page at the start will make its way back to the
NFS client and corruption will occur. The corruption was found, and
this fix of using an extra page then verified, using the 'dt' utility:
dt of=/mnt/share1/dt_a.test passes=1 bs=47008 count=2 \
iotype=sequential pattern=iot onerr=abort oncerr=abort
see: https://github.com/RobinTMiller/dt.git
Any misaligned READ that is less than 32K won't be expanded to be
DIO-aligned (this heuristic avoids excess work, like allocating
start_extra_page, for smaller IO that can generally already perform
well using buffered IO).
Also add an nfsd_read_vector_dio trace event. This combination of
trace events is useful:
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector_dio/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable
Which for this dd command:
dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct
Results in:
nfsd-16580 [001] ..... 5672.403130: nfsd_read_vector_dio: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008 start=0+0 end=47104-96
nfsd-16580 [001] ..... 5672.403131: nfsd_read_vector: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47104
nfsd-16580 [001] ..... 5672.403134: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0x0 bytecount 0xb800
nfsd-16580 [001] ..... 5672.404380: nfsd_read_io_done: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008
nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector_dio: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 start=46592+416 end=94208-192
nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=46592 len=47616
nfsd-16580 [001] ..... 5672.404673: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0xb600 bytecount 0xba00
nfsd-16580 [001] ..... 5672.405771: nfsd_read_io_done: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008
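For reference, a standalone sketch of the expansion arithmetic (assuming
dio_read_offset_align is 512; it reproduces the second
nfsd_read_vector_dio line above):
#include <stdio.h>

#define round_down(x, a) ((x) & ~((a) - 1))
#define round_up(x, a) round_down((x) + (a) - 1, (a))

int main(void)
{
	unsigned long long align = 512;	/* assumed dio_read_offset_align */
	unsigned long long offset = 47008, len = 47008;
	unsigned long long start = round_down(offset, align);
	unsigned long long end = round_up(offset + len, align);

	/* prints: start=46592+416 end=94208-192 */
	printf("start=%llu+%llu end=%llu-%llu\n",
	       start, offset - start, end, end - (offset + len));
	return 0;
}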
Suggested-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
fs/nfsd/trace.h | 37 ++++++++
fs/nfsd/vfs.c | 180 ++++++++++++++++++++++++++++++++-----
include/linux/sunrpc/svc.h | 5 +-
3 files changed, 200 insertions(+), 22 deletions(-)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a664fdf1161e..55055482f8a8 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -473,6 +473,43 @@ DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);
DEFINE_NFSD_IO_EVENT(commit_done);
+TRACE_EVENT(nfsd_read_vector_dio,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ u64 offset,
+ u32 len,
+ loff_t start,
+ loff_t start_extra,
+ loff_t end,
+ loff_t end_extra),
+ TP_ARGS(rqstp, fhp, offset, len, start, start_extra, end, end_extra),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(u64, offset)
+ __field(u32, len)
+ __field(loff_t, start)
+ __field(loff_t, start_extra)
+ __field(loff_t, end)
+ __field(loff_t, end_extra)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->start = start;
+ __entry->start_extra = start_extra;
+ __entry->end = end;
+ __entry->end_extra = end_extra;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u start=%llu+%llu end=%llu-%llu",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->len,
+ __entry->start, __entry->start_extra,
+ __entry->end, __entry->end_extra)
+);
+
DECLARE_EVENT_CLASS(nfsd_err_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index dfeaed3d2d41..b79ff5f6e4f1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -19,6 +19,7 @@
#include <linux/splice.h>
#include <linux/falloc.h>
#include <linux/fcntl.h>
+#include <linux/math.h>
#include <linux/namei.h>
#include <linux/delay.h>
#include <linux/fsnotify.h>
@@ -1065,6 +1066,113 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
+struct nfsd_dio_io {
+ loff_t start;
+ loff_t end;
+ unsigned long start_extra;
+ unsigned long end_extra;
+ struct page *start_extra_page;
+};
+
+static void init_nfsd_dio_io(struct nfsd_dio_io *dio_io)
+{
+ memset(dio_io, 0, sizeof(*dio_io));
+ dio_io->start_extra_page = NULL;
+}
+
+static bool nfsd_analyze_read_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ unsigned long len, unsigned int base,
+ struct nfsd_dio_io *dio_io)
+{
+ const u32 dio_blocksize = nf->nf_dio_read_offset_align;
+ loff_t orig_end = offset + len;
+
+ if (WARN_ONCE(!nf->nf_dio_mem_align || !nf->nf_dio_read_offset_align,
+ "%s: underlying filesystem has not provided DIO alignment info\n",
+ __func__))
+ return false;
+
+ if ((base & (nf->nf_dio_mem_align-1)) != 0)
+ return false;
+
+ if (WARN_ONCE(dio_blocksize > PAGE_SIZE,
+ "%s: underlying storage's dio_blocksize=%u > PAGE_SIZE=%lu\n",
+ __func__, dio_blocksize, PAGE_SIZE))
+ return false;
+
+ dio_io->start = round_down(offset, dio_blocksize);
+ dio_io->end = round_up(orig_end, dio_blocksize);
+ dio_io->start_extra = offset - dio_io->start;
+ dio_io->end_extra = dio_io->end - orig_end;
+
+ /* don't expand READ for IO less than 32K */
+ if ((dio_io->start_extra || dio_io->end_extra) && (len < (32 << 10))) {
+ init_nfsd_dio_io(dio_io);
+ return false;
+ }
+
+ if (dio_io->start_extra) {
+ dio_io->start_extra_page = alloc_page(GFP_KERNEL);
+ if (WARN_ONCE(dio_io->start_extra_page == NULL,
+ "%s: Unable to allocate start_extra_page\n", __func__)) {
+ init_nfsd_dio_io(dio_io);
+ return false;
+ }
+ }
+
+ /* Show original offset and count, and how it was expanded for DIO */
+ trace_nfsd_read_vector_dio(rqstp, fhp, offset, len,
+ dio_io->start, dio_io->start_extra,
+ dio_io->end, dio_io->end_extra);
+
+ return true;
+}
+
+static ssize_t nfsd_complete_misaligned_read_dio(struct svc_rqst *rqstp,
+ struct nfsd_dio_io *dio_io,
+ ssize_t bytes_read,
+ unsigned long bytes_expected,
+ loff_t *offset,
+ unsigned long *rq_bvec_numpages)
+{
+ ssize_t host_err = bytes_read;
+ loff_t v;
+
+ /* If nfsd_analyze_read_dio() allocated a start_extra_page it must
+ * be removed from rqstp->rq_bvec[] to avoid returning unwanted data.
+ */
+ if (dio_io->start_extra_page) {
+ __free_page(dio_io->start_extra_page);
+ *rq_bvec_numpages -= 1;
+ v = *rq_bvec_numpages;
+ memmove(rqstp->rq_bvec, rqstp->rq_bvec + 1,
+ v * sizeof(struct bio_vec));
+ }
+ /* Eliminate any end_extra bytes from the last page */
+ v = *rq_bvec_numpages;
+ rqstp->rq_bvec[v - 1].bv_len -= dio_io->end_extra;
+
+ if (host_err < 0)
+ return host_err;
+
+ /* nfsd_analyze_read_dio() may have expanded the start and end,
+ * if so adjust returned read size to reflect original extent.
+ */
+ *offset += dio_io->start_extra;
+ if (likely(host_err >= dio_io->start_extra)) {
+ host_err -= dio_io->start_extra;
+ if (host_err > bytes_expected)
+ host_err = bytes_expected;
+ } else {
+ /* Short read that didn't read any of requested data */
+ host_err = 0;
+ }
+
+ return host_err;
+}
+
/**
* nfsd_iter_read - Perform a VFS read using an iterator
* @rqstp: RPC transaction context
@@ -1086,45 +1194,75 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int base, u32 *eof)
{
struct file *file = nf->nf_file;
- unsigned long v, total;
+ unsigned long v, total, in_count = *count;
+ struct nfsd_dio_io dio_io;
struct iov_iter iter;
struct kiocb kiocb;
- ssize_t host_err;
+ ssize_t host_err = 0;
size_t len;
+ init_nfsd_dio_io(&dio_io);
init_sync_kiocb(&kiocb, file);
+
+ /*
+ * If NFSD_IO_DIRECT enabled, expand any misaligned READ to
+ * the next DIO-aligned block (on either end of the READ).
+ */
+ if (nfsd_io_cache_read == NFSD_IO_DIRECT) {
+ if (nfsd_analyze_read_dio(rqstp, fhp, nf, offset,
+ in_count, base, &dio_io)) {
+ /* trace_nfsd_read_vector() will reflect larger
+ * DIO-aligned READ.
+ */
+ offset = dio_io.start;
+ in_count = dio_io.end - offset;
+ kiocb.ki_flags = IOCB_DIRECT;
+ }
+ } else if (nfsd_io_cache_read == NFSD_IO_DONTCACHE) {
+ kiocb.ki_flags = IOCB_DONTCACHE;
+ }
+
kiocb.ki_pos = offset;
v = 0;
- total = *count;
+ total = in_count;
+ if (dio_io.start_extra) {
+ bvec_set_page(&rqstp->rq_bvec[v++], dio_io.start_extra_page,
+ dio_io.start_extra, PAGE_SIZE - dio_io.start_extra);
+ total -= dio_io.start_extra;
+ }
while (total) {
len = min_t(size_t, total, PAGE_SIZE - base);
- bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
+ bvec_set_page(&rqstp->rq_bvec[v++], *(rqstp->rq_next_page++),
len, base);
total -= len;
- ++v;
base = 0;
}
- WARN_ON_ONCE(v > rqstp->rq_maxpages);
+ if (WARN_ONCE(v > rqstp->rq_maxpages,
+ "%s: v=%lu exceeds rqstp->rq_maxpages=%lu\n", __func__,
+ v, rqstp->rq_maxpages)) {
+ host_err = -EINVAL;
+ }
+
+ if (!host_err) {
+ trace_nfsd_read_vector(rqstp, fhp, offset, in_count);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count);
- trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
+ /* Double check nfsd_analyze_read_dio's DIO-aligned result */
+ if (unlikely((kiocb.ki_flags & IOCB_DIRECT) &&
+ !iov_iter_is_aligned(&iter,
+ nf->nf_dio_mem_align - 1,
+ nf->nf_dio_read_offset_align - 1))) {
+ /* Fallback to buffered IO */
+ kiocb.ki_flags &= ~IOCB_DIRECT;
+ }
- switch (nfsd_io_cache_read) {
- case NFSD_IO_DIRECT:
- if (nf->nf_dio_mem_align && nf->nf_dio_read_offset_align &&
- iov_iter_is_aligned(&iter, nf->nf_dio_mem_align - 1,
- nf->nf_dio_read_offset_align - 1))
- kiocb.ki_flags = IOCB_DIRECT;
- break;
- case NFSD_IO_DONTCACHE:
- kiocb.ki_flags = IOCB_DONTCACHE;
- break;
- case NFSD_IO_BUFFERED:
- break;
+ host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
}
- host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
+ if (dio_io.start_extra || dio_io.end_extra) {
+ host_err = nfsd_complete_misaligned_read_dio(rqstp, &dio_io,
+ host_err, *count, &offset, &v);
+ }
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e64ab444e0a7..190c2667500e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
* pages, one for the request, and one for the reply.
* nfsd_splice_actor() might need an extra page when a READ payload
* is not page-aligned.
+ * nfsd_iter_read() might need two extra pages when a READ payload
+ * is not DIO-aligned -- but nfsd_iter_read() and nfsd_splice_actor()
+ * are mutually exclusive (so reuse page reserved for nfsd_splice_actor).
*/
static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
{
- return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1;
+ return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1;
}
/*
--
2.44.0
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
` (4 preceding siblings ...)
2025-07-14 22:42 ` [PATCH v3 5/5] NFSD: issue READs using O_DIRECT even if IO is misaligned Mike Snitzer
@ 2025-07-15 9:24 ` Daire Byrne
2025-07-15 11:28 ` Jeff Layton
2025-07-15 13:31 ` Chuck Lever
2025-07-15 13:59 ` Chuck Lever
6 siblings, 2 replies; 13+ messages in thread
From: Daire Byrne @ 2025-07-15 9:24 UTC (permalink / raw)
To: Mike Snitzer; +Cc: Chuck Lever, Jeff Layton, linux-nfs
Just a quick note to say that we are one of the examples (batch render
farm) where we rely on the NFSD pagecache a lot.
We have read heavy workloads where many clients share much of the same
input data (e.g. rendering sequential frames).
In fact, our 2 x 100gbit servers have 3TB of RAM and serve 70% of all
reads from nfsd pagecache. It is not uncommon to max out the 200gbit
network in this way even with spinning rust storage.
Anyway, as you were.
Daire
On Mon, 14 Jul 2025 at 23:42, Mike Snitzer <snitzer@kernel.org> wrote:
>
> Hi,
>
> Summary (by Jeff Layton [0]):
> "The basic problem is that the pagecache is pretty useless for
> satisfying READs from nfsd. Most NFS workloads don't involve I/O to
> the same files from multiple clients. The client ends up having most
> of the data in its cache already and only very rarely do we need to
> revisit the data on the server.
>
> At the same time, it's really easy to overwhelm the storage with
> pagecache writeback with modern memory sizes. Having nfsd bypass the
> pagecache altogether is potentially a huge performance win, if it can
> be made to work safely."
>
> The performance win associated with using NFSD DIRECT was previously
> summarized here:
> https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
> This picture offers a nice summary of performance gains:
> https://original.art/NFSD_direct_vs_buffered_IO.jpg
>
> This v3 series was developed on top of Chuck's nfsd_testing which has 2
> patches that saw fh_getattr() moved, etc (v2 of this series included
> those patches but since they got review during v2 and Chuck already
> has them staged in nfsd-testing I didn't think it made sense to keep
> them included in this v3).
>
> Changes since v2 include:
> - explored suggestion to use string based interface (e.g. "direct"
> instead of 3) but debugfs seems to only supports numeric values.
> - shifted numeric values for debugfs interface from 0-2 to 1-3 and
> made 0 UNSPECIFIED (which is the default)
> - if user specifies io_cache_read or io_cache_write mode other than 1,
> 2 or 3 (via debugfs) they will get an error message
> - pass a data structure to nfsd_analyze_read_dio rather than so many
> in/out params
> - improved comments as requested (e.g. "Must remove first
> start_extra_page from rqstp->rq_bvec" was reworked)
> - use memmove instead of opencoded shift in
> nfsd_complete_misaligned_read_dio
> - dropped the still very important "lib/iov_iter: remove piecewise
> bvec length checking in iov_iter_aligned_bvec" patch because it
> needs to be handled separately.
> - various other changes to improve code
>
> Thanks,
> Mike
>
> [0]: https://lore.kernel.org/linux-nfs/b1accdad470f19614f9d3865bb3a4c69958e5800.camel@kernel.org/
>
> Mike Snitzer (5):
> NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
> NFSD: pass nfsd_file to nfsd_iter_read()
> NFSD: add io_cache_read controls to debugfs interface
> NFSD: add io_cache_write controls to debugfs interface
> NFSD: issue READs using O_DIRECT even if IO is misaligned
>
> fs/nfsd/debugfs.c | 102 +++++++++++++++++++
> fs/nfsd/filecache.c | 32 ++++++
> fs/nfsd/filecache.h | 4 +
> fs/nfsd/nfs4xdr.c | 8 +-
> fs/nfsd/nfsd.h | 10 ++
> fs/nfsd/nfsfh.c | 4 +
> fs/nfsd/trace.h | 37 +++++++
> fs/nfsd/vfs.c | 197 ++++++++++++++++++++++++++++++++++---
> fs/nfsd/vfs.h | 2 +-
> include/linux/sunrpc/svc.h | 5 +-
> 10 files changed, 383 insertions(+), 18 deletions(-)
>
> --
> 2.44.0
>
>
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-15 9:24 ` [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Daire Byrne
@ 2025-07-15 11:28 ` Jeff Layton
2025-07-15 13:31 ` Chuck Lever
1 sibling, 0 replies; 13+ messages in thread
From: Jeff Layton @ 2025-07-15 11:28 UTC (permalink / raw)
To: Daire Byrne, Mike Snitzer; +Cc: Chuck Lever, linux-nfs
Understood. We're not looking to abandon you guys. I think bog-standard
buffered I/O will be the default option for the foreseeable future. We
are pretty keen to add other I/O modes as an _option_, however, because
they do help other important workloads.
The hard part is how we make this tunable without shooting our future
selves in our collective feet. That's the main reason this is all being
done in debugfs for the moment, since that carries no ABI guarantees.
-- Jeff
On Tue, 2025-07-15 at 10:24 +0100, Daire Byrne wrote:
> Just a quick note to say that we are one of the examples (batch render
> farm) where we rely on the NFSD pagecache a lot.
>
> We have read heavy workloads where many clients share much of the same
> input data (e.g. rendering sequential frames).
>
> In fact, our 2 x 100gbit servers have 3TB of RAM and serve 70% of all
> reads from nfsd pagecache. It is not uncommon to max out the 200gbit
> network in this way even with spinning rust storage.
>
> Anyway, as you were.
>
> Daire
>
> On Mon, 14 Jul 2025 at 23:42, Mike Snitzer <snitzer@kernel.org> wrote:
> >
> > Hi,
> >
> > Summary (by Jeff Layton [0]):
> > "The basic problem is that the pagecache is pretty useless for
> > satisfying READs from nfsd. Most NFS workloads don't involve I/O to
> > the same files from multiple clients. The client ends up having most
> > of the data in its cache already and only very rarely do we need to
> > revisit the data on the server.
> >
> > At the same time, it's really easy to overwhelm the storage with
> > pagecache writeback with modern memory sizes. Having nfsd bypass the
> > pagecache altogether is potentially a huge performance win, if it can
> > be made to work safely."
> >
> > The performance win associated with using NFSD DIRECT was previously
> > summarized here:
> > https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
> > This picture offers a nice summary of performance gains:
> > https://original.art/NFSD_direct_vs_buffered_IO.jpg
> >
> > This v3 series was developed on top of Chuck's nfsd_testing which has 2
> > patches that saw fh_getattr() moved, etc (v2 of this series included
> > those patches but since they got review during v2 and Chuck already
> > has them staged in nfsd-testing I didn't think it made sense to keep
> > them included in this v3).
> >
> > Changes since v2 include:
> > - explored suggestion to use string based interface (e.g. "direct"
> > instead of 3) but debugfs seems to only support numeric values.
> > - shifted numeric values for debugfs interface from 0-2 to 1-3 and
> > made 0 UNSPECIFIED (which is the default)
> > - if user specifies io_cache_read or io_cache_write mode other than 1,
> > 2 or 3 (via debugfs) they will get an error message
> > - pass a data structure to nfsd_analyze_read_dio rather than so many
> > in/out params
> > - improved comments as requested (e.g. "Must remove first
> > start_extra_page from rqstp->rq_bvec" was reworked)
> > - use memmove instead of opencoded shift in
> > nfsd_complete_misaligned_read_dio
> > - dropped the still very important "lib/iov_iter: remove piecewise
> > bvec length checking in iov_iter_aligned_bvec" patch because it
> > needs to be handled separately.
> > - various other changes to improve code
> >
> > Thanks,
> > Mike
> >
> > [0]: https://lore.kernel.org/linux-nfs/b1accdad470f19614f9d3865bb3a4c69958e5800.camel@kernel.org/
> >
> > Mike Snitzer (5):
> > NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
> > NFSD: pass nfsd_file to nfsd_iter_read()
> > NFSD: add io_cache_read controls to debugfs interface
> > NFSD: add io_cache_write controls to debugfs interface
> > NFSD: issue READs using O_DIRECT even if IO is misaligned
> >
> > fs/nfsd/debugfs.c | 102 +++++++++++++++++++
> > fs/nfsd/filecache.c | 32 ++++++
> > fs/nfsd/filecache.h | 4 +
> > fs/nfsd/nfs4xdr.c | 8 +-
> > fs/nfsd/nfsd.h | 10 ++
> > fs/nfsd/nfsfh.c | 4 +
> > fs/nfsd/trace.h | 37 +++++++
> > fs/nfsd/vfs.c | 197 ++++++++++++++++++++++++++++++++++---
> > fs/nfsd/vfs.h | 2 +-
> > include/linux/sunrpc/svc.h | 5 +-
> > 10 files changed, 383 insertions(+), 18 deletions(-)
> >
> > --
> > 2.44.0
> >
> >
--
Jeff Layton <jlayton@kernel.org>
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-15 9:24 ` [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Daire Byrne
2025-07-15 11:28 ` Jeff Layton
@ 2025-07-15 13:31 ` Chuck Lever
2025-07-16 10:28 ` Daire Byrne
1 sibling, 1 reply; 13+ messages in thread
From: Chuck Lever @ 2025-07-15 13:31 UTC (permalink / raw)
To: Daire Byrne; +Cc: Jeff Layton, linux-nfs, Mike Snitzer
On 7/15/25 5:24 AM, Daire Byrne wrote:
> Just a quick note to say that we are one of the examples (batch render
> farm) where we rely on the NFSD pagecache a lot.
The new O_DIRECT style READs depend on the cache in the underlying block
devices to keep READs fast. So, there is still some caching happening
on the NFS server in this mode.
> We have read heavy workloads where many clients share much of the same
> input data (e.g. rendering sequential frames).
>
> In fact, our 2 x 100gbit servers have 3TB of RAM and serve 70% of all
> reads from nfsd pagecache. It is not uncommon to max out the 200gbit
> network in this way even with spinning rust storage.
Can you tell us what persistent storage underlies your data sets? Are
the hard drives in a hardware or software RAID, for example?
Note that Mike's features are enabled via a debugfs switch -- this is
because they are experimental for the moment. The default setting is
to continue using the server's page cache.
--
Chuck Lever
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-15 13:31 ` Chuck Lever
@ 2025-07-16 10:28 ` Daire Byrne
0 siblings, 0 replies; 13+ messages in thread
From: Daire Byrne @ 2025-07-16 10:28 UTC (permalink / raw)
To: Chuck Lever; +Cc: Jeff Layton, linux-nfs, Mike Snitzer
On Tue, 15 Jul 2025 at 14:31, Chuck Lever <chuck.lever@oracle.com> wrote:
>
> On 7/15/25 5:24 AM, Daire Byrne wrote:
> > Just a quick note to say that we are one of the examples (batch render
> > farm) where we rely on the NFSD pagecache a lot.
>
> The new O_DIRECT style READs depend on the cache in the underlying block
> devices to keep READs fast. So, there is still some caching happening
> on the NFS server in this mode.
Ah right, of course. I wonder how much we actually use nfsd pagecache
versus the block device pagecache then...
> > We have read heavy workloads where many clients share much of the same
> > input data (e.g. rendering sequential frames).
> >
> > In fact, our 2 x 100gbit servers have 3TB of RAM and serve 70% of all
> > reads from nfsd pagecache. It is not uncommon to max out the 200gbit
> > network in this way even with spinning rust storage.
>
> Can you tell us what persistent storage underlies your data sets? Are
> the hard drives in a hardware or software RAID, for example?
Generally SAS-attached external RAID arrays. We often use another
smaller NVMe layer too (dm-cache or opencas) in front of it (LVM +
XFS).
But really, it's the 3TB of RAM per server (1PB disk) that does most
of our heavy lifting. Our read/write ratio is something like 5:1 and
we have a pretty aggressive/short writeback cache (to minimise long
write backlogs). Looking forward to multi-threaded writeback to see
how that helps us.
> Note that Mike's features are enabled via a debugfs switch -- this is
> because they are experimental for the moment. The default setting is
> to continue using the server's page cache.
Yep, all good. Like you said, it may be that we are more reliant on
the block device cache anyway.
Cheers,
Daire
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-14 22:42 [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Mike Snitzer
` (5 preceding siblings ...)
2025-07-15 9:24 ` [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes Daire Byrne
@ 2025-07-15 13:59 ` Chuck Lever
2025-07-15 14:50 ` Mike Snitzer
6 siblings, 1 reply; 13+ messages in thread
From: Chuck Lever @ 2025-07-15 13:59 UTC (permalink / raw)
To: Mike Snitzer; +Cc: linux-nfs, Jeff Layton
Hi Mike,
There are a lot of speculative claims here. I would prefer that the
motivation for this work focus on the workload that is actually
suffering from the added layer of cache, rather than making some
claim that "hey, this change is good for all taxpayers!" ;-)
On 7/14/25 6:42 PM, Mike Snitzer wrote:
> Hi,
>
> Summary (by Jeff Layton [0]):
> "The basic problem is that the pagecache is pretty useless for
> satisfying READs from nfsd.
A bold claim like this needs to be backed up with careful benchmark
results.
But really, the actual problem that you are trying to address is that,
for /your/ workloads, the server's page cache is not useful and can be
counterproductive when the server's working set is larger than its RAM.
So, I would replace this sentence.
> Most NFS workloads don't involve I/O to
> the same files from multiple clients. The client ends up having most
> of the data in its cache already and only very rarely do we need to
> revisit the data on the server.
Maybe it would be better to say:
"Common NFS workloads do not involve shared files, and client working
sets can comfortably fit in each client's page cache."
And then add a description of the workload you are trying to optimize.
> At the same time, it's really easy to overwhelm the storage with
> pagecache writeback with modern memory sizes.
Again, perhaps this isn't quite accurate? The problem is not only the
server's memory size; it's that the server doesn't start writeback soon
enough, writes back without parallelism, and does not handle thrashing
very well. This is very likely due to the traditional Linux design
that makes writeback lazy (in the computer science sense of "lazy"),
assuming that if the working set does not fit in memory, then you should
simply purchase more RAM.
> Having nfsd bypass the
> pagecache altogether is potentially a huge performance win, if it can
> be made to work safely."
Then finally, "Therefore, we provide the option to make I/O avoid the
NFS server's page cache, as an experiment." Which I hope is somewhat
less alarming to folks who still rely on the server's page cache.
> The performance win associated with using NFSD DIRECT was previously
> summarized here:
> https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
> This picture offers a nice summary of performance gains:
> https://original.art/NFSD_direct_vs_buffered_IO.jpg
>
> This v3 series was developed ontop of Chuck's nfsd_testing which has 2
> patches that saw fh_getattr() moved, etc (v2 of this series included
> those patches but since they got review during v2 and Chuck already
> has them staged in nfsd-testing I didn't think it made sense to keep
> them included in this v3).
>
> Changes since v2 include:
> - explored suggestion to use string based interface (e.g. "direct"
> instead of 3) but debugfs seems to only supports numeric values.
> - shifted numeric values for debugfs interface from 0-2 to 1-3 and
> made 0 UNSPECIFIED (which is the default)
> - if user specifies io_cache_read or io_cache_write mode other than 1,
> 2 or 3 (via debugfs) they will get an error message
> - pass a data structure to nfsd_analyze_read_dio rather than so many
> in/out params
> - improved comments as requested (e.g. "Must remove first
> start_extra_page from rqstp->rq_bvec" was reworked)
> - use memmove instead of opencoded shift in
> nfsd_complete_misaligned_read_dio
> - dropped the still very important "lib/iov_iter: remove piecewise
> bvec length checking in iov_iter_aligned_bvec" patch because it
> needs to be handled separately.
> - various other changes to improve code
>
> Thanks,
> Mike
>
> [0]: https://lore.kernel.org/linux-nfs/b1accdad470f19614f9d3865bb3a4c69958e5800.camel@kernel.org/
>
> Mike Snitzer (5):
> NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
> NFSD: pass nfsd_file to nfsd_iter_read()
> NFSD: add io_cache_read controls to debugfs interface
> NFSD: add io_cache_write controls to debugfs interface
> NFSD: issue READs using O_DIRECT even if IO is misaligned
>
> fs/nfsd/debugfs.c | 102 +++++++++++++++++++
> fs/nfsd/filecache.c | 32 ++++++
> fs/nfsd/filecache.h | 4 +
> fs/nfsd/nfs4xdr.c | 8 +-
> fs/nfsd/nfsd.h | 10 ++
> fs/nfsd/nfsfh.c | 4 +
> fs/nfsd/trace.h | 37 +++++++
> fs/nfsd/vfs.c | 197 ++++++++++++++++++++++++++++++++++---
> fs/nfsd/vfs.h | 2 +-
> include/linux/sunrpc/svc.h | 5 +-
> 10 files changed, 383 insertions(+), 18 deletions(-)
>
The series is beginning to look clean to me, and we have introduced
several simple but effective clean-ups along the way.
My only concern is that we're making the read path more complex rather
than less. (This isn't a new concern; I have wanted to make reads
simpler by, say, removing splice support, for quite a while, as you
know). I'm hoping that, once the experiment has "concluded", we find
ways of simplifying the code and the administrative interface. (That
is not an objection. call it a Future Work comment).
Also, a remaining open question is how we want to deal with READ_PLUS
and holes.
--
Chuck Lever
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-15 13:59 ` Chuck Lever
@ 2025-07-15 14:50 ` Mike Snitzer
2025-07-15 15:59 ` Chuck Lever
0 siblings, 1 reply; 13+ messages in thread
From: Mike Snitzer @ 2025-07-15 14:50 UTC (permalink / raw)
To: Chuck Lever; +Cc: linux-nfs, Jeff Layton
On Tue, Jul 15, 2025 at 09:59:05AM -0400, Chuck Lever wrote:
> Hi Mike,
>
> There are a lot of speculative claims here. I would prefer that the
> motivation for this work focus on the workload that is actually
> suffering from the added layer of cache, rather than making some
> claim that "hey, this change is good for all taxpayers!" ;-)
Really not sure what you're referring to. I didn't make any
speculative claims...
> On 7/14/25 6:42 PM, Mike Snitzer wrote:
> > Hi,
> >
> > Summary (by Jeff Layton [0]):
> > "The basic problem is that the pagecache is pretty useless for
> > satisfying READs from nfsd.
>
> A bold claim like this needs to be backed up with careful benchmark
> results.
>
> But really, the actual problem that you are trying to address is that,
> for /your/ workloads, the server's page cache is not useful and can be
> counterproductive when the server's working set is larger than its RAM.
>
> So, I would replace this sentence.
Oh, you are referring to Jeff's previous summary. Noted! ;)
> > Most NFS workloads don't involve I/O to
> > the same files from multiple clients. The client ends up having most
> > of the data in its cache already and only very rarely do we need to
> > revisit the data on the server.
>
> Maybe it would be better to say:
>
> "Common NFS workloads do not involve shared files, and client working
> sets can comfortably fit in each client's page cache."
>
> And then add a description of the workload you are trying to optimize.
Sure, certainly can/will do for v4 (if/when v4 needed).
> > At the same time, it's really easy to overwhelm the storage with
> > pagecache writeback with modern memory sizes.
>
> Again, perhaps this isn't quite accurate? The problem is not only the
> server's memory size; it's that the server doesn't start writeback soon
> enough, writes back without parallelism, and does not handle thrashing
> very well. This is very likely due to the traditional Linux design
> that makes writeback lazy (in the computer science sense of "lazy"),
> assuming that if the working set does not fit in memory, then you should
> simply purchase more RAM.
>
>
> > Having nfsd bypass the
> > pagecache altogether is potentially a huge performance win, if it can
> > be made to work safely."
>
> Then finally, "Therefore, we provide the option to make I/O avoid the
> NFS server's page cache, as an experiment." Which I hope is somewhat
> less alarming to folks who still rely on the server's page cache.
I can tighten it up respecting/including your feedback. 0th patch
header aside, are you wanting this included somewhere in Documentation?
(if it were to be part of Documentation you'd then be welcome to
refine it as you see needed, but I can take a stab at laying down a
starting point)
> > The performance win associated with using NFSD DIRECT was previously
> > summarized here:
> > https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
> > This picture offers a nice summary of performance gains:
> > https://original.art/NFSD_direct_vs_buffered_IO.jpg
> >
> > This v3 series was developed on top of Chuck's nfsd_testing which has 2
> > patches that saw fh_getattr() moved, etc (v2 of this series included
> > those patches but since they got review during v2 and Chuck already
> > has them staged in nfsd-testing I didn't think it made sense to keep
> > them included in this v3).
> >
> > Changes since v2 include:
> > - explored suggestion to use string based interface (e.g. "direct"
> > instead of 3) but debugfs seems to only support numeric values.
> > - shifted numeric values for debugfs interface from 0-2 to 1-3 and
> > made 0 UNSPECIFIED (which is the default)
> > - if user specifies io_cache_read or io_cache_write mode other than 1,
> > 2 or 3 (via debugfs) they will get an error message
> > - pass a data structure to nfsd_analyze_read_dio rather than so many
> > in/out params
> > - improved comments as requested (e.g. "Must remove first
> > start_extra_page from rqstp->rq_bvec" was reworked)
> > - use memmove instead of opencoded shift in
> > nfsd_complete_misaligned_read_dio
> > - dropped the still very important "lib/iov_iter: remove piecewise
> > bvec length checking in iov_iter_aligned_bvec" patch because it
> > needs to be handled separately.
> > - various other changes to improve code
> >
> > Thanks,
> > Mike
> >
> > [0]: https://lore.kernel.org/linux-nfs/b1accdad470f19614f9d3865bb3a4c69958e5800.camel@kernel.org/
> >
> > Mike Snitzer (5):
> > NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
> > NFSD: pass nfsd_file to nfsd_iter_read()
> > NFSD: add io_cache_read controls to debugfs interface
> > NFSD: add io_cache_write controls to debugfs interface
> > NFSD: issue READs using O_DIRECT even if IO is misaligned
> >
> > fs/nfsd/debugfs.c | 102 +++++++++++++++++++
> > fs/nfsd/filecache.c | 32 ++++++
> > fs/nfsd/filecache.h | 4 +
> > fs/nfsd/nfs4xdr.c | 8 +-
> > fs/nfsd/nfsd.h | 10 ++
> > fs/nfsd/nfsfh.c | 4 +
> > fs/nfsd/trace.h | 37 +++++++
> > fs/nfsd/vfs.c | 197 ++++++++++++++++++++++++++++++++++---
> > fs/nfsd/vfs.h | 2 +-
> > include/linux/sunrpc/svc.h | 5 +-
> > 10 files changed, 383 insertions(+), 18 deletions(-)
> >
>
> The series is beginning to look clean to me, and we have introduced
> several simple but effective clean-ups along the way.
Thanks.
> My only concern is that we're making the read path more complex rather
> than less. (This isn't a new concern; I have wanted to make reads
> simpler by, say, removing splice support, for quite a while, as you
> know). I'm hoping that, once the experiment has "concluded", we find
> ways of simplifying the code and the administrative interface. (That
> is not an objection; call it a Future Work comment.)
Yeah, the READ path does get more complex, but less so than before I
factored code out into a couple of methods... I'm open to any cleanup
suggestions to run with as "Future Work". I think the pivot from
debugfs to per-export controls will be the perfect opportunity to polish.
> Also, a remaining open question is how we want to deal with READ_PLUS
> and holes.
Hmm, not familiar with this... I'll have a look. But if you have
anything further on this point please share.
Thanks,
Mike
* Re: [PATCH v3 0/5] NFSD: add "NFSD DIRECT" and "NFSD DONTCACHE" IO modes
2025-07-15 14:50 ` Mike Snitzer
@ 2025-07-15 15:59 ` Chuck Lever
0 siblings, 0 replies; 13+ messages in thread
From: Chuck Lever @ 2025-07-15 15:59 UTC (permalink / raw)
To: Mike Snitzer; +Cc: linux-nfs, Jeff Layton
On 7/15/25 10:50 AM, Mike Snitzer wrote:
> On Tue, Jul 15, 2025 at 09:59:05AM -0400, Chuck Lever wrote:
>> Hi Mike,
>>
>> There are a lot of speculative claims here. I would prefer that the
>> motivation for this work focus on the workload that is actually
>> suffering from the added layer of cache, rather than making some
>> claim that "hey, this change is good for all taxpayers!" ;-)
>
> Really not sure what you're referring to. I didn't make any
> speculative claims...
>
>> On 7/14/25 6:42 PM, Mike Snitzer wrote:
>>> Hi,
>>>
>>> Summary (by Jeff Layton [0]):
>>> "The basic problem is that the pagecache is pretty useless for
>>> satisfying READs from nfsd.
>>
>> A bold claim like this needs to be backed up with careful benchmark
>> results.
>>
>> But really, the actual problem that you are trying to address is that,
>> for /your/ workloads, the server's page cache is not useful and can be
>> counterproductive when the server's working set is larger than its RAM.
>>
>> So, I would replace this sentence.
>
> Oh, you are referring to Jeff's previous summary. Noted! ;)
>
>>> Most NFS workloads don't involve I/O to
>>> the same files from multiple clients. The client ends up having most
>>> of the data in its cache already and only very rarely do we need to
>>> revisit the data on the server.
>>
>> Maybe it would be better to say:
>>
>> "Common NFS workloads do not involve shared files, and client working
>> sets can comfortably fit in each client's page cache."
>>
>> And then add a description of the workload you are trying to optimize.
>
> Sure, certainly can/will do for v4 (if/when v4 needed).
>
>>> At the same time, it's really easy to overwhelm the storage with
>>> pagecache writeback with modern memory sizes.
>>
>> Again, perhaps this isn't quite accurate? The problem is not only the
>> server's memory size; it's that the server doesn't start writeback soon
>> enough, writes back without parallelism, and does not handle thrashing
>> very well. This is very likely due to the traditional Linux design
>> that makes writeback lazy (in the computer science sense of "lazy"),
>> assuming that if the working set does not fit in memory, then you should
>> simply purchase more RAM.
>>
>>
>>> Having nfsd bypass the
>>> pagecache altogether is potentially a huge performance win, if it can
>>> be made to work safely."
>>
>> Then finally, "Therefore, we provide the option to make I/O avoid the
>> NFS server's page cache, as an experiment." Which I hope is somewhat
>> less alarming to folks who still rely on the server's page cache.
>
> I can tighten it up respecting/including your feedback. 0th patch
> header aside, are you wanting this included somewhere in Documentation?
Nothing in a fixed Documentation file, at least until we start nailing
down the new per-export administrative interfaces.
> (if it were to be part of Documentation you'd then be welcome to
> refine it as you see needed, but I can take a stab at laying down a
> starting point)
You are in full control of the cover letter, of course. I wanted to
point out where I thought the purpose of this work might differ a little
from what is advertised in this cover letter, which is currently the
only record of the rationale for the series.
>>> The performance win associated with using NFSD DIRECT was previously
>>> summarized here:
>>> https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
>>> This picture offers a nice summary of performance gains:
>>> https://original.art/NFSD_direct_vs_buffered_IO.jpg
>>>
>>> This v3 series was developed on top of Chuck's nfsd_testing which has 2
>>> patches that saw fh_getattr() moved, etc (v2 of this series included
>>> those patches but since they got review during v2 and Chuck already
>>> has them staged in nfsd-testing I didn't think it made sense to keep
>>> them included in this v3).
>>>
>>> Changes since v2 include:
>>> - explored suggestion to use string based interface (e.g. "direct"
>>> instead of 3) but debugfs seems to only support numeric values.
>>> - shifted numeric values for debugfs interface from 0-2 to 1-3 and
>>> made 0 UNSPECIFIED (which is the default)
>>> - if user specifies io_cache_read or io_cache_write mode other than 1,
>>> 2 or 3 (via debugfs) they will get an error message
>>> - pass a data structure to nfsd_analyze_read_dio rather than so many
>>> in/out params
>>> - improved comments as requested (e.g. "Must remove first
>>> start_extra_page from rqstp->rq_bvec" was reworked)
>>> - use memmove instead of opencoded shift in
>>> nfsd_complete_misaligned_read_dio
>>> - dropped the still very important "lib/iov_iter: remove piecewise
>>> bvec length checking in iov_iter_aligned_bvec" patch because it
>>> needs to be handled separately.
>>> - various other changes to improve code
>>>
>>> Thanks,
>>> Mike
>>>
>>> [0]: https://lore.kernel.org/linux-nfs/b1accdad470f19614f9d3865bb3a4c69958e5800.camel@kernel.org/
>>>
>>> Mike Snitzer (5):
>>> NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support
>>> NFSD: pass nfsd_file to nfsd_iter_read()
>>> NFSD: add io_cache_read controls to debugfs interface
>>> NFSD: add io_cache_write controls to debugfs interface
>>> NFSD: issue READs using O_DIRECT even if IO is misaligned
>>>
>>> fs/nfsd/debugfs.c | 102 +++++++++++++++++++
>>> fs/nfsd/filecache.c | 32 ++++++
>>> fs/nfsd/filecache.h | 4 +
>>> fs/nfsd/nfs4xdr.c | 8 +-
>>> fs/nfsd/nfsd.h | 10 ++
>>> fs/nfsd/nfsfh.c | 4 +
>>> fs/nfsd/trace.h | 37 +++++++
>>> fs/nfsd/vfs.c | 197 ++++++++++++++++++++++++++++++++++---
>>> fs/nfsd/vfs.h | 2 +-
>>> include/linux/sunrpc/svc.h | 5 +-
>>> 10 files changed, 383 insertions(+), 18 deletions(-)
>>>
>>
>> The series is beginning to look clean to me, and we have introduced
>> several simple but effective clean-ups along the way.
>
> Thanks.
>
>> My only concern is that we're making the read path more complex rather
>> than less. (This isn't a new concern; I have wanted to make reads
>> simpler by, say, removing splice support, for quite a while, as you
>> know). I'm hoping that, once the experiment has "concluded", we find
>> ways of simplifying the code and the administrative interface. (That
>> is not an objection; call it a Future Work comment.)
>
> Yeah, the READ path does get more complex, but less so than before I
> factored code out into a couple of methods... I'm open to any cleanup
> suggestions to run with as "Future Work". I think the pivot from
> debugfs to per-export controls will be the perfect opportunity to polish.
>
>> Also, a remaining open question is how we want to deal with READ_PLUS
>> and holes.
>
> Hmm, not familiar with this... I'll have a look. But if you have
> anything further on this point please share.
Currently I don't think we need to deal with it in this patch set. But
note that NFSv4.2 READ_PLUS can return a map of unallocated areas in a
file. We should think a little about whether additional logic is needed
when using O_DIRECT READs.
--
Chuck Lever