* [PATCH] nfsd: Implement large extent array support in pNFS @ 2025-06-04 13:07 Sergey Bashirov 2025-06-04 14:10 ` Chuck Lever 2025-06-04 14:54 ` Christoph Hellwig 0 siblings, 2 replies; 11+ messages in thread From: Sergey Bashirov @ 2025-06-04 13:07 UTC (permalink / raw) To: J . Bruce Fields, Chuck Lever Cc: linux-nfs, Sergey Bashirov, Konstantin Evtushenko When pNFS client in block layout mode sends layoutcommit RPC to MDS, a variable length array of modified extents is supplied within request. This patch allows NFS server to accept such extent arrays if they do not fit within single memory page. Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com> Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com> --- fs/nfsd/blocklayout.c | 12 ++++--- fs/nfsd/blocklayoutxdr.c | 78 ++++++++++++++++++++++++++++++++-------- fs/nfsd/blocklayoutxdr.h | 8 ++--- fs/nfsd/nfs4xdr.c | 7 ++-- fs/nfsd/xdr4.h | 2 +- 5 files changed, 79 insertions(+), 28 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e5c0982a381d..d40a0860fcf6 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -179,8 +179,10 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, struct iomap *iomaps; int nr_iomaps; - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, + lcp->lc_up_len, + &iomaps, + i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); @@ -317,8 +319,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct iomap *iomaps; int nr_iomaps; - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); + nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout, + lcp->lc_up_len, + &iomaps, + i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); diff --git 
a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 442543304930..e3e3d79c8b4f 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, } int -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, + struct iomap **iomapp, u32 block_size) { + struct xdr_stream xdr; struct iomap *iomaps; u32 nr_iomaps, i; + char scratch[sizeof(struct pnfs_block_extent)]; if (len < sizeof(u32)) { dprintk("%s: extent array too small: %u\n", __func__, len); @@ -119,7 +121,15 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, return -EINVAL; } - nr_iomaps = be32_to_cpup(p++); + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); + + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { + dprintk("%s: failed to decode extent array length\n", + __func__); + return -EINVAL; + } + if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { dprintk("%s: extent array size mismatch: %u/%u\n", __func__, len, nr_iomaps); @@ -135,28 +145,51 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) < + sizeof(bex.vol_id)) { + dprintk("%s: failed to decode device id for entry %u\n", + __func__, i); + goto fail; + } - p = xdr_decode_hyper(p, &bex.foff); + if (xdr_stream_decode_u64(&xdr, &bex.foff)) { + dprintk("%s: failed to decode offset for entry %u\n", + __func__, i); + goto fail; + } if (bex.foff & (block_size - 1)) { dprintk("%s: unaligned offset 0x%llx\n", __func__, bex.foff); goto fail; } - p = xdr_decode_hyper(p, &bex.len); + + if (xdr_stream_decode_u64(&xdr, 
&bex.len)) { + dprintk("%s: failed to decode length for entry %u\n", + __func__, i); + goto fail; + } if (bex.len & (block_size - 1)) { dprintk("%s: unaligned length 0x%llx\n", __func__, bex.foff); goto fail; } - p = xdr_decode_hyper(p, &bex.soff); + + if (xdr_stream_decode_u64(&xdr, &bex.soff)) { + dprintk("%s: failed to decode soffset for entry %u\n", + __func__, i); + goto fail; + } if (bex.soff & (block_size - 1)) { dprintk("%s: unaligned disk offset 0x%llx\n", __func__, bex.soff); goto fail; } - bex.es = be32_to_cpup(p++); + + if (xdr_stream_decode_u32(&xdr, &bex.es)) { + dprintk("%s: failed to decode estate for entry %u\n", + __func__, i); + goto fail; + } if (bex.es != PNFS_BLOCK_READWRITE_DATA) { dprintk("%s: incorrect extent state %d\n", __func__, bex.es); @@ -175,18 +208,27 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } int -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, + struct iomap **iomapp, u32 block_size) { + struct xdr_stream xdr; struct iomap *iomaps; u32 nr_iomaps, expected, i; + char scratch[sizeof(u64)]; if (len < sizeof(u32)) { dprintk("%s: extent array too small: %u\n", __func__, len); return -EINVAL; } - nr_iomaps = be32_to_cpup(p++); + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); + + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { + dprintk("%s: failed to decode extent array length\n", __func__); + return -EINVAL; + } + expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; if (len != expected) { dprintk("%s: extent array size mismatch: %u/%u\n", @@ -203,14 +245,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { u64 val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(&xdr, &val)) { + dprintk("%s: failed to decode offset for entry %u\n", + __func__, i); 
+ goto fail; + } if (val & (block_size - 1)) { dprintk("%s: unaligned offset 0x%llx\n", __func__, val); goto fail; } iomaps[i].offset = val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(&xdr, &val)) { + dprintk("%s: failed to decode length for entry %u\n", + __func__, i); + goto fail; + } if (val & (block_size - 1)) { dprintk("%s: unaligned length 0x%llx\n", __func__, val); goto fail; diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index bc5166bfe46b..c4c8139b8e96 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, struct nfsd4_layoutget *lgp); -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, + struct iomap **iomapp, u32 block_size); +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, + struct iomap **iomapp, u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5a93a5db4fb0..81f42dc75b95 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -592,11 +592,8 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) return nfserr_bad_xdr; - if (lcp->lc_up_len > 0) { - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); - if (!lcp->lc_up_layout) - return nfserr_bad_xdr; - } + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len)) + return nfserr_bad_xdr; return nfs_ok; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 846ab6df9d48..8516a1a6b46d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -492,7 +492,7 @@ struct nfsd4_layoutcommit { struct timespec64 
lc_mtime; /* request */ u32 lc_layout_type; /* request */ u32 lc_up_len; /* layout length */ - void *lc_up_layout; /* decoded by callback */ + struct xdr_buf lc_up_layout; /* request, decoded by callback */ u32 lc_size_chg; /* boolean for response */ u64 lc_newsize; /* response */ }; -- 2.43.0 ^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-04 13:07 [PATCH] nfsd: Implement large extent array support in pNFS Sergey Bashirov @ 2025-06-04 14:10 ` Chuck Lever 2025-06-04 14:54 ` Christoph Hellwig 1 sibling, 0 replies; 11+ messages in thread From: Chuck Lever @ 2025-06-04 14:10 UTC (permalink / raw) To: Sergey Bashirov, J . Bruce Fields; +Cc: linux-nfs, Konstantin Evtushenko On 6/4/25 9:07 AM, Sergey Bashirov wrote: > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > a variable length array of modified extents is supplied within request. > This patch allows NFS server to accept such extent arrays if they do not > fit within single memory page. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com> > Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com> > Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com> > --- > fs/nfsd/blocklayout.c | 12 ++++--- > fs/nfsd/blocklayoutxdr.c | 78 ++++++++++++++++++++++++++++++++-------- > fs/nfsd/blocklayoutxdr.h | 8 ++--- > fs/nfsd/nfs4xdr.c | 7 ++-- > fs/nfsd/xdr4.h | 2 +- > 5 files changed, 79 insertions(+), 28 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index e5c0982a381d..d40a0860fcf6 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -179,8 +179,10 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, > + lcp->lc_up_len, > + &iomaps, > + i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > @@ -317,8 +319,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = 
nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout, > + lcp->lc_up_len, > + &iomaps, > + i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index 442543304930..e3e3d79c8b4f 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > } > > int > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size) > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size) > { > + struct xdr_stream xdr; > struct iomap *iomaps; > u32 nr_iomaps, i; > + char scratch[sizeof(struct pnfs_block_extent)]; > > if (len < sizeof(u32)) { > dprintk("%s: extent array too small: %u\n", __func__, len); > @@ -119,7 +121,15 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array length\n", > + __func__); > + return -EINVAL; > + } > + > if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { > dprintk("%s: extent array size mismatch: %u/%u\n", > __func__, len, nr_iomaps); > @@ -135,28 +145,51 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) < > + sizeof(bex.vol_id)) { > + dprintk("%s: failed to decode device id for entry %u\n", > + __func__, i); > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(&xdr, &bex.foff)) 
{ > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > + > + if (xdr_stream_decode_u64(&xdr, &bex.len)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.len & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > + > + if (xdr_stream_decode_u64(&xdr, &bex.soff)) { > + dprintk("%s: failed to decode soffset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.soff & (block_size - 1)) { > dprintk("%s: unaligned disk offset 0x%llx\n", > __func__, bex.soff); > goto fail; > } > - bex.es = be32_to_cpup(p++); > + > + if (xdr_stream_decode_u32(&xdr, &bex.es)) { > + dprintk("%s: failed to decode estate for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > dprintk("%s: incorrect extent state %d\n", > __func__, bex.es); > @@ -175,18 +208,27 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > } > > int > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size) > +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size) > { > + struct xdr_stream xdr; > struct iomap *iomaps; > u32 nr_iomaps, expected, i; > + char scratch[sizeof(u64)]; > > if (len < sizeof(u32)) { > dprintk("%s: extent array too small: %u\n", __func__, len); > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array length\n", __func__); > + return -EINVAL; > + } > + > 
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > @@ -203,14 +245,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", __func__, val); > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", __func__, val); > goto fail; > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index bc5166bfe46b..c4c8139b8e96 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > struct nfsd4_layoutget *lgp); > -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size); > +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 5a93a5db4fb0..81f42dc75b95 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -592,11 +592,8 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > > if 
(xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > return nfserr_bad_xdr; > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len)) > + return nfserr_bad_xdr; > > return nfs_ok; > } > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index 846ab6df9d48..8516a1a6b46d 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -492,7 +492,7 @@ struct nfsd4_layoutcommit { > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* request, decoded by callback */ > u32 lc_size_chg; /* boolean for response */ > u64 lc_newsize; /* response */ > }; Thanks for the suggestion, Sergey! Note the MAINTAINERS entry for NFSD: $ scripts/get_maintainer.pl fs/nfsd/vfs.c Chuck Lever <chuck.lever@oracle.com> (maintainer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) Jeff Layton <jlayton@kernel.org> (maintainer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) Neil Brown <neilb@suse.de> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) Olga Kornievskaia <okorniev@redhat.com> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) Dai Ngo <Dai.Ngo@oracle.com> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) Tom Talpey <tom@talpey.com> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) linux-nfs@vger.kernel.org (open list:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS) linux-kernel@vger.kernel.org (open list) KERNEL NFSD, SUNRPC, AND LOCKD SERVERS status: Supported In particular, Dai is looking at the Linux NFS server's pNFS with iSCSI implementation right at the moment and might have some thoughts about expanding the number of extents in block layouts. Can you repost your patch with the current reviewers and maintainers copied as appropriate? 
-- Chuck Lever ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-04 13:07 [PATCH] nfsd: Implement large extent array support in pNFS Sergey Bashirov 2025-06-04 14:10 ` Chuck Lever @ 2025-06-04 14:54 ` Christoph Hellwig 2025-06-10 0:36 ` Sergey Bashirov 1 sibling, 1 reply; 11+ messages in thread From: Christoph Hellwig @ 2025-06-04 14:54 UTC (permalink / raw) To: Sergey Bashirov Cc: J . Bruce Fields, Chuck Lever, linux-nfs, Konstantin Evtushenko On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote: > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > a variable length array of modified extents is supplied within request. > This patch allows NFS server to accept such extent arrays if they do not > fit within single memory page. How did you manage to trigger such a workload? Also your patch doesn't apply to current mainline. > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, > + lcp->lc_up_len, > + &iomaps, > + i_blocksize(inode)); Please drop all the spurious reformatting to a harder to read style. > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > } > > int > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size) > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size) > { So if you pass the xdr_buf that already has the length, can't we drop the len argument? 
> + struct xdr_stream xdr; Also I'm not the expert on the xdr_stream API, but instead of setting up a sub-buffer here shouldn't we reuse that from the actual XDR decoding stage, and if that can't work (Chuck is probably a better source for know-how on that than me), at least initialize it in the main nfsd layoutcommit handler rather than in the layout drivers. To me, we'd probably want that initialized in the core nfsd code and not the layout driver. > + if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) < Overly long line. > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len)) Same. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-04 14:54 ` Christoph Hellwig @ 2025-06-10 0:36 ` Sergey Bashirov 2025-06-10 5:39 ` Christoph Hellwig 0 siblings, 1 reply; 11+ messages in thread From: Sergey Bashirov @ 2025-06-10 0:36 UTC (permalink / raw) To: Christoph Hellwig, Chuck Lever Cc: J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Wed, Jun 04, 2025 at 10:10:15AM -0400, Chuck Lever wrote: > Can you repost your patch with the current reviewers and maintainers > copied as appropriate? Sorry for the newbie mistake, this is the first patch I'm sending upstream. Yes, I will send patch v2 with the right reviewers and maintainers. On Wed, Jun 04, 2025 at 07:54:31AM -0700, Christoph Hellwig wrote: > On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote: > > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > > a variable length array of modified extents is supplied within request. > > This patch allows NFS server to accept such extent arrays if they do not > > fit within single memory page. > > How did you manage to trigger such a workload? Together with Konstantin we spent a lot of time enabling the pNFS block volume setup. We have SDS that can attach virtual block devices via vhost-user-blk to virtual machines. And we researched the way to create parallel or distributed file system on top of this SDS. From this point of view, pNFS block volume layout architecture looks quite suitable. So, we created several VMs, configured pNFS and started testing. In fact, during our extensive testing, we encountered a variety of issues including deadlocks, livelocks, and corrupted files, which we eventually fixed. Now we have a working setup and we would like to clean up the code and contribute it. > Also you patch doesn't apply to current mainline. We will use the git repository link for NFSD from the MAINTAINERS file and "nfsd-next" branch to work on top of it. 
Please let me know if this is the wrong repository or branch in our case. > > +++ b/fs/nfsd/blocklayoutxdr.c > > @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > > } > > > > int > > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > > - u32 block_size) > > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > > + struct iomap **iomapp, u32 block_size) > > { > > So if you pass the xdr_buf that already has the length, can't we drop > the len argument? Thanks for the suggestions! I will rework it in patch v2 to get rid of the len argument and the lc_up_len field. > > + struct xdr_stream xdr; > > Also I'm not the expert on the xdr_stream API, but instead of setting > up a sub-buffer here shouldn't we reuse that from the actual XDR > decoding stage, and if that can't work (Chuck is probably a better > source for know how on that then me), at least initialize it in the > main nfsd layoutcommit handler rather than in the layout drivers. > me we'd probably want that initialized in the core nfsd code and not > the layout driver. As for the sub-buffer, the xdr_buf structure is initialized in the core nfsd code to point only to the "opaque" field of the "layoutupdate4" structure. Since this field is specific to each layout driver, its xdr_stream is created on demand inside the field handler. For example, the "opaque" field is not used in the file layouts. Do we really need to expose the xdr_stream outside the field handler? Probably not. I also checked how this is implemented in the nfs client code and found that xdr_stream is created in a similar way inside the layout driver. Below I have outlined some thoughts on why implemented it this way. Please correct me if I missed anything. Let's say we have a layout commit with 1000 extents (in practice we observed much more). The size of each block extent structure is 44 bytes, and we have another 4 bytes for the total number of extents. 
So, the "opaque" field of the "layoutupdate4" structure has a size of 44004 bytes. In this case, we cannot simply borrow a pointer to the xdr internal page or the scratch buffer using xdr_inline_decode(). What options do we have? 1. Allocate a large enough memory buffer and copy the "opaque" field into it. But I think an extra copy of a large field is not what we prefer. 2. When RPC is received, nfsd_dispatch() first decodes the entire compound request and only then processes each operation. Yes, we can create a new callback in the layout driver interface to decode the "opaque" field during the decoding phase and use the actual xdr stream of the request. What I don't like here is that the layout driver is forced to parse a large data buffer before general checks are done (sequence ID, file handler, state ID, range, grace period, etc.). This opens up opportunities to abuse the server by sending invalid layout commits with the maximum possible number of extents (RPC can be up to 1MB). 3. It is also possible to store only the position of the "opaque" field in the actual xdr stream of the request during the decoding phase and defer large data buffer parsing until the processing phase. This is what the original code does, and this patch takes it in the same direction. -- Sergey Bashirov ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-10 0:36 ` Sergey Bashirov @ 2025-06-10 5:39 ` Christoph Hellwig 2025-06-10 15:24 ` Sergey Bashirov 0 siblings, 1 reply; 11+ messages in thread From: Christoph Hellwig @ 2025-06-10 5:39 UTC (permalink / raw) To: Sergey Bashirov Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote: > On Wed, Jun 04, 2025 at 07:54:31AM -0700, Christoph Hellwig wrote: > > On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote: > > > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > > > a variable length array of modified extents is supplied within request. > > > This patch allows NFS server to accept such extent arrays if they do not > > > fit within single memory page. > > > > How did you manage to trigger such a workload? > > Together with Konstantin we spent a lot of time enabling the pNFS block > volume setup. We have SDS that can attach virtual block devices via > vhost-user-blk to virtual machines. And we researched the way to create > parallel or distributed file system on top of this SDS. From this point > of view, pNFS block volume layout architecture looks quite suitable. So, > we created several VMs, configured pNFS and started testing. In fact, > during our extensive testing, we encountered a variety of issues including > deadlocks, livelocks, and corrupted files, which we eventually fixed. > Now we have a working setup and we would like to clean up the code and > contribute it. Can you share your reproducer scripts for client and server? Btw, also as a little warning: the current pNFS code means any client can corrupt the XFS metadata. 
If you want to actually use the code in production you'll probably want to figure out a way to either use the RT device for exposed data (should be easy, but the RT allocator sucks..), or find a way to otherwise restrict clients from overwriting metadata. > > Also you patch doesn't apply to current mainline. > > We will use the git repository link for NFSD from the MAINTAINERS file and > "nfsd-next" branch to work on top of it. Please let me know if this is the > wrong repository or branch in our case. I guess that's generally fine, but around a -rc1 release it's a bit cumbersome. > As for the sub-buffer, the xdr_buf structure is initialized in the core > nfsd code to point only to the "opaque" field of the "layoutupdate4" > structure. Since this field is specific to each layout driver, its > xdr_stream is created on demand inside the field handler. For example, > the "opaque" field is not used in the file layouts. Do we really need to > expose the xdr_stream outside the field handler? Probably not. I also > checked how this is implemented in the nfs client code and found that > xdr_stream is created in a similar way inside the layout driver. Below > I have outlined some thoughts on why implemented it this way. Please > correct me if I missed anything. Well, the fields are opaque, but everyone has to either decode (or ignore it). So having common setup sounds useful. > 1. Allocate a large enough memory buffer and copy the "opaque" field into > it. But I think an extra copy of a large field is not what we prefer. Agreed. > 2. When RPC is received, nfsd_dispatch() first decodes the entire compound > request and only then processes each operation. Yes, we can create a new > callback in the layout driver interface to decode the "opaque" field > during the decoding phase and use the actual xdr stream of the request. 
> What I don't like here is that the layout driver is forced to parse a > large data buffer before general checks are done (sequence ID, file > handler, state ID, range, grace period, etc.). This opens up > opportunities to abuse the server by sending invalid layout commits with > the maximum possible number of extents (RPC can be up to 1MB). OTOH the same happens for parsing any other NFS compound that isn't split into layouts, isn't it? And we have total size limits on the transfer. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-10 5:39 ` Christoph Hellwig @ 2025-06-10 15:24 ` Sergey Bashirov 2025-06-11 6:55 ` Christoph Hellwig 0 siblings, 1 reply; 11+ messages in thread From: Sergey Bashirov @ 2025-06-10 15:24 UTC (permalink / raw) To: Christoph Hellwig Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Mon, Jun 09, 2025 at 10:39:06PM -0700, Christoph Hellwig wrote: > On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote: > > Together with Konstantin we spent a lot of time enabling the pNFS block > > volume setup. We have SDS that can attach virtual block devices via > > vhost-user-blk to virtual machines. And we researched the way to create > > parallel or distributed file system on top of this SDS. From this point > > of view, pNFS block volume layout architecture looks quite suitable. So, > > we created several VMs, configured pNFS and started testing. In fact, > > during our extensive testing, we encountered a variety of issues including > > deadlocks, livelocks, and corrupted files, which we eventually fixed. > > Now we have a working setup and we would like to clean up the code and > > contribute it. > > Can you share your reproducer scripts for client and server? I will try. First of all, you need two VMs connected to the same network. The hardest part is somehow to connect a shared block device to both VMs with RW access. As I mentioned, we use a proprietary SDS for it. Note that if you pass the device to the VM as virtio-blk, the client will select the block layout driver, and if you pass the device to the VM as virtio-scsi, the client will select the scsi layout driver. 
Script I use to start VMs (both use the same parameters, only the names, boot images and network addresses differ): #!/bin/sh BOOT_DISK="-drive format=qcow2,file=pnfs_server.img" SSH_NET="-device virtio-net-pci,netdev=net0,mac=52:54:00:12:34:57 -netdev user,id=net0,hostfwd=tcp::20001-:22" NFS_NET="-device e1000,netdev=net1,mac=52:54:00:12:34:67 -netdev socket,id=net1,listen=:1234" MP="/dev/hugepages/libvirt/qemu" VHOST_DISK="-object memory-backend-file,id=mem,size=8G,mem-path=$MP,share=on -numa node,memdev=mem -chardev socket,id=char1,path=/var/lib/storage/pnfs_server.sock -device vhost-user-blk-pci,id=blk1,chardev=char1,num-queues=16" qemu-system-x86_64 -daemonize -display none -name pnfs_server \ -cpu host -enable-kvm -smp 8 -m 8G \ $BOOT_DISK $SSH_NET $VHOST_DISK $NFS_NET The server's /etc/nfs.conf: [nfsd] grace-time=90 lease-time=120 vers2=n vers3=y vers4=n vers4.0=n vers4.1=n vers4.2=y The server's /etc/exports: /mnt/export *(pnfs,rw,sync,insecure,no_root_squash,no_subtree_check) Please note that the block volume layout does not support partition tables, volume groups, RAID, etc. So you need to create XFS on the raw block device. In my case shared virtio-blk disk is /dev/vda. And the file system can be prepared by following these steps: sudo mkfs -t xfs /dev/vda sudo mkdir -p /mnt/export sudo mount -t xfs /dev/vda /mnt/export After these steps you can start or restart the server: sudo systemctl restart nfs-kernel-server On the client side, you need to have the same /dev/vda device available, but not mounted. Additionally, you need the blkmapd service running. 
Perform the following steps to mount the share: sudo systemctl start nfs-blkmap sudo mkdir -p /mnt/pnfs sudo mount -t nfs4 -v -o minorversion=2,sync,hard,noatime,rsize=1048576,wsize=1048576,timeo=600,retrans=2 192.168.1.1:/mnt/export /mnt/pnfs This should create 2.5k extents: fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \ --rw=randwrite --ioengine=libaio --direct=1 --bs=4k \ --iodepth=128 --fallocate=none You can check it on the server side: xfs_bmap -elpvv /mnt/export/test.raw Troubleshooting. If any error occurs, then kernel falls back to NFSv3. Use nfsstat or mountstats to view RPC counters. Normally the READ and WRITE counters should be zero, and the LAYOUTGET, LAYOUTCOMMIT, LAYOUTRETURN should increase as you work with files. If the network connection and shared volume are working fine, then first of all check the status of blkmapd, most probably its fault is the reason. Note that the client code also has problems with the block extent array. Currently the client tries to pack all the block extents it needs to commit into one RPC. And if there are too many of them, you will see "RPC: fragment too large" error on the server side. That's why we set rsize and wsize to 1M for now. Another problem is that when the extent array does not fit into a single memory page, the client code discards the first page of encoded extents while reallocating a larger buffer to continue layout commit encoding. So even with this patch you may still notice that some files are not written correctly. But at least the server shouldn't send the badxdr error on a well-formed layout commit. > Btw, also as a little warning: the current pNFS code mean any client > can corrupt the XFS metadata. If you want to actually use the code > in production you'll probably want to figure out a way to either use > the RT device for exposed data (should be easy, but the RT allocator > sucks..), or find a way to otherwise restrict clients from overwriting > metadata. Thanks for the advice! 
Yes, we have had issues with XFS corruption especially when multiple clients were writing to the same file in parallel. Spent some time debugging layout recalls and client fencing to figure out what happened. > > As for the sub-buffer, the xdr_buf structure is initialized in the core > > nfsd code to point only to the "opaque" field of the "layoutupdate4" > > structure. Since this field is specific to each layout driver, its > > xdr_stream is created on demand inside the field handler. For example, > > the "opaque" field is not used in the file layouts. Do we really need to > > expose the xdr_stream outside the field handler? Probably not. I also > > checked how this is implemented in the nfs client code and found that > > xdr_stream is created in a similar way inside the layout driver. Below > > I have outlined some thoughts on why implemented it this way. Please > > correct me if I missed anything. > > Well, the fields are opaque, but everyone has to either decode (or > ignore it). So having common setup sounds useful. > > > 2. When RPC is received, nfsd_dispatch() first decodes the entire compound > > request and only then processes each operation. Yes, we can create a new > > callback in the layout driver interface to decode the "opaque" field > > during the decoding phase and use the actual xdr stream of the request. > > What I don't like here is that the layout driver is forced to parse a > > large data buffer before general checks are done (sequence ID, file > > handler, state ID, range, grace period, etc.). This opens up > > opportunities to abuse the server by sending invalid layout commits with > > the maximum possible number of extents (RPC can be up to 1MB). > > OTOH the same happens for parsing any other NFS compound that isn't > split into layouts, isn't it? And we have total size limits on the > transfer. I agree, one large request and 1000 small requests look the same on the wire. 
So, setting up an xdr_stream at a higher level requires adding it to the nfsd4_layoutcommit structure. Either as a substructure, which will significantly increase the overall size of the layout commit argument, or as a pointer, which will require some memory allocation and deallocation logic. Also, in the core nfsd code we don't know the sufficient scratch buffer size for a particular layout driver, most likely we will allocate a page for it. This all seems a bit overengineered compared to two local variables on the stack. I will think about it a little more. -- Sergey Bashirov ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-10 15:24 ` Sergey Bashirov @ 2025-06-11 6:55 ` Christoph Hellwig 2025-06-11 12:19 ` Sergey Bashirov 2025-06-11 13:53 ` Chuck Lever 0 siblings, 2 replies; 11+ messages in thread From: Christoph Hellwig @ 2025-06-11 6:55 UTC (permalink / raw) To: Sergey Bashirov Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote: > On Mon, Jun 09, 2025 at 10:39:06PM -0700, Christoph Hellwig wrote: > > On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote: > > > Together with Konstantin we spent a lot of time enabling the pNFS block > > > volume setup. We have SDS that can attach virtual block devices via > > > vhost-user-blk to virtual machines. And we researched the way to create > > > parallel or distributed file system on top of this SDS. From this point > > > of view, pNFS block volume layout architecture looks quite suitable. So, > > > we created several VMs, configured pNFS and started testing. In fact, > > > during our extensive testing, we encountered a variety of issues including > > > deadlocks, livelocks, and corrupted files, which we eventually fixed. > > > Now we have a working setup and we would like to clean up the code and > > > contribute it. > > > > Can you share your reproducer scripts for client and server? > > I will try. First of all, you need two VMs connected to the same network. > The hardest part is somehow to connect a shared block device to both VMs > with RW access. I know the basic setup :) > On the client side, you need to have the same /dev/vda device available, > but not mounted. Additionally, you need the blkmapd service running. blkmapd is only needed for the block layout, which should generally be avoided as it can't be used reliably because working fencing is almost impossible. 
> This should create 2.5k extents: > fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \ > --rw=randwrite --ioengine=libaio --direct=1 --bs=4k \ > --iodepth=128 --fallocate=none Thanks! We should find a way to wire up the test coverage somewhere, e.g. xfstests. > Troubleshooting. If any error occurs, then kernel falls back to NFSv3. That should really be NFSv4. > the client code also has problems with the block extent array. Currently > the client tries to pack all the block extents it needs to commit into > one RPC. And if there are too many of them, you will see > "RPC: fragment too large" error on the server side. That's why > we set rsize and wsize to 1M for now. We'll really need to fix the client to split when going over the maximum compound size. > Another problem is that when the > extent array does not fit into a single memory page, the client code > discards the first page of encoded extents while reallocating a larger > buffer to continue layout commit encoding. So even with this patch you > may still notice that some files are not written correctly. But at least > the server shouldn't send the badxdr error on a well-formed layout commit. Eww, we'll need to fix that as well. Would be good to have a reproducer for that case as well. > > Btw, also as a little warning: the current pNFS code mean any client > > can corrupt the XFS metadata. If you want to actually use the code > > in production you'll probably want to figure out a way to either use > > the RT device for exposed data (should be easy, but the RT allocator > > sucks..), or find a way to otherwise restrict clients from overwriting > > metadata. > > Thanks for the advice! Yes, we have had issues with XFS corruption > especially when multiple clients were writing to the same file in > parallel. Spent some time debugging layout recalls and client fencing > to figure out what happened. Normal operation should not cause that, what did you see there? 
I mean a malicious client targeting metadata outside its layout. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-11 6:55 ` Christoph Hellwig @ 2025-06-11 12:19 ` Sergey Bashirov 2025-06-12 6:33 ` Christoph Hellwig 2025-06-11 13:53 ` Chuck Lever 1 sibling, 1 reply; 11+ messages in thread From: Sergey Bashirov @ 2025-06-11 12:19 UTC (permalink / raw) To: Christoph Hellwig Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Tue, Jun 10, 2025 at 11:55:09PM -0700, Christoph Hellwig wrote: > On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote: > > the client code also has problems with the block extent array. Currently > > the client tries to pack all the block extents it needs to commit into > > one RPC. And if there are too many of them, you will see > > "RPC: fragment too large" error on the server side. That's why > > we set rsize and wsize to 1M for now. > > We'll really need to fix the client to split when going over the maximum > compoung size. Yes, working on patches to send for review. > > Another problem is that when the > > extent array does not fit into a single memory page, the client code > > discards the first page of encoded extents while reallocating a larger > > buffer to continue layout commit encoding. So even with this patch you > > may still notice that some files are not written correctly. But at least > > the server shouldn't send the badxdr error on a well-formed layout commit. > > Eww, we'll need to fix that as well. Would be good to have a reproducer > for that case as well. Will prepare and send patches too. The reproducer should be the same as I send for the server. Just try to check what FIO has written with the additional option --verify=crc32c. > > Thanks for the advice! Yes, we have had issues with XFS corruption > > especially when multiple clients were writing to the same file in > > parallel. Spent some time debugging layout recalls and client fencing > > to figure out what happened. 
> > Normal operation should not cause that, what did you see there? I think, this is not an NFS implementation issue, but rather a question of how to properly implement the client fencing. In a distributed storage system, there is a delay between the time NFS server requests a blocking of writes to a shared volume for a particular client and the time that blocking takes effect. If we choose an optimistic approach and assume that fencing is done by simply sending a request (without waiting for actual processing by the underlying storage system), then we might end up in the following situation. Let's think of layoutget as a byte range locking mechanism in terms of writing to a single file from multiple clients. First of all, a client writing without O_DIRECT through the page cache will delay processing of the layout recall for too long if its user-space application really writes a lot. As a consequence, we observe significant performance degradation, and sometimes the server decides that the client is not responding at all and tries to fence it to allow the second client to get the lock. At this moment we still have the first client writing, and the server releasing the xfs_lease associated with the layout of the first client. And if we are really unlucky, XFS might reallocate extents, so the first client will be writing outside the file. And if we are really, really unlucky, XFS might put some metadata there as well. -- Sergey Bashirov ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-11 12:19 ` Sergey Bashirov @ 2025-06-12 6:33 ` Christoph Hellwig 2025-06-12 8:13 ` Sergey Bashirov 0 siblings, 1 reply; 11+ messages in thread From: Christoph Hellwig @ 2025-06-12 6:33 UTC (permalink / raw) To: Sergey Bashirov Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Wed, Jun 11, 2025 at 03:19:29PM +0300, Sergey Bashirov wrote: > > Normal operation should not cause that, what did you see there? > > I think, this is not an NFS implementation issue, but rather a question > of how to properly implement the client fencing. In a distributed > storage system, there is a delay between the time NFS server requests > a blocking of writes to a shared volume for a particular client and the > time that blocking takes effect. If we choose an optimistic approach and > assume that fencing is done by simply sending a request (without waiting > for actual processing by the underlying storage system), then we might > end up in the following situation. I guess this is using block layout and your own fencing? Because with the SCSI layout we fence right from the kernel path before force returning the layout. The fact that block layout can't do reliable fencing is the reason why I came up with the SCSI layout, that can. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-12 6:33 ` Christoph Hellwig @ 2025-06-12 8:13 ` Sergey Bashirov 0 siblings, 0 replies; 11+ messages in thread From: Sergey Bashirov @ 2025-06-12 8:13 UTC (permalink / raw) To: Christoph Hellwig Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs On Wed, Jun 11, 2025 at 11:33:27PM -0700, Christoph Hellwig wrote: > On Wed, Jun 11, 2025 at 03:19:29PM +0300, Sergey Bashirov wrote: > > > Normal operation should not cause that, what did you see there? > > > > I think, this is not an NFS implementation issue, but rather a question > > of how to properly implement the client fencing. In a distributed > > storage system, there is a delay between the time NFS server requests > > a blocking of writes to a shared volume for a particular client and the > > time that blocking takes effect. If we choose an optimistic approach and > > assume that fencing is done by simply sending a request (without waiting > > for actual processing by the underlying storage system), then we might > > end up in the following situation. > > I guess this is using block layout and your own fencing? Because > with the SCSI layout we fence right from the kernel path before > force returning the layout. The fact that block layout can't do > reliable fencing is the reason why I came up with the SCSI layout, > that can. Yes, you are right. By the way, even with SCSI Persistent Reservations the fencing is not entirely clean and simple. We tried a third party enterprise storage system to test the scsi layout. But it seems that SCSI PR implementation there is imperfect. We occasionally observed PR_KEYs being erroneously revoked by the storage system. But the NFS part of this setup worked fine. -- Sergey Bashirov ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] nfsd: Implement large extent array support in pNFS 2025-06-11 6:55 ` Christoph Hellwig 2025-06-11 12:19 ` Sergey Bashirov @ 2025-06-11 13:53 ` Chuck Lever 1 sibling, 0 replies; 11+ messages in thread From: Chuck Lever @ 2025-06-11 13:53 UTC (permalink / raw) To: Christoph Hellwig, Sergey Bashirov Cc: J . Bruce Fields, Konstantin Evtushenko, linux-nfs On 6/11/25 2:55 AM, Christoph Hellwig wrote: > On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote: >> >> This should create 2.5k extents: >> fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \ >> --rw=randwrite --ioengine=libaio --direct=1 --bs=4k \ >> --iodepth=128 --fallocate=none > > Thanks! We should find a way to wire up the test coverage > somewhere, e.g. xfstests. I implemented support for setting up pNFS with iSCSI layouts in kdevops last summer. Unfortunately since then I've been distracted by trying to find enough hardware resources to add it to the regular matrix of tests. -- Chuck Lever ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2025-06-12 8:13 UTC | newest] Thread overview: 11+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2025-06-04 13:07 [PATCH] nfsd: Implement large extent array support in pNFS Sergey Bashirov 2025-06-04 14:10 ` Chuck Lever 2025-06-04 14:54 ` Christoph Hellwig 2025-06-10 0:36 ` Sergey Bashirov 2025-06-10 5:39 ` Christoph Hellwig 2025-06-10 15:24 ` Sergey Bashirov 2025-06-11 6:55 ` Christoph Hellwig 2025-06-11 12:19 ` Sergey Bashirov 2025-06-12 6:33 ` Christoph Hellwig 2025-06-12 8:13 ` Sergey Bashirov 2025-06-11 13:53 ` Chuck Lever
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox