public inbox for linux-nfs@vger.kernel.org
* [PATCH] nfsd: Implement large extent array support in pNFS
@ 2025-06-04 13:07 Sergey Bashirov
  2025-06-04 14:10 ` Chuck Lever
  2025-06-04 14:54 ` Christoph Hellwig
  0 siblings, 2 replies; 11+ messages in thread
From: Sergey Bashirov @ 2025-06-04 13:07 UTC (permalink / raw)
  To: J . Bruce Fields, Chuck Lever
  Cc: linux-nfs, Sergey Bashirov, Konstantin Evtushenko

When a pNFS client in block layout mode sends a layoutcommit RPC to the
MDS, a variable-length array of modified extents is supplied within the
request. This patch allows the NFS server to accept such extent arrays
even when they do not fit within a single memory page.

Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com>
Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com>
Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
---
 fs/nfsd/blocklayout.c    | 12 ++++---
 fs/nfsd/blocklayoutxdr.c | 78 ++++++++++++++++++++++++++++++++--------
 fs/nfsd/blocklayoutxdr.h |  8 ++---
 fs/nfsd/nfs4xdr.c        |  7 ++--
 fs/nfsd/xdr4.h           |  2 +-
 5 files changed, 79 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e5c0982a381d..d40a0860fcf6 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -179,8 +179,10 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
 	struct iomap *iomaps;
 	int nr_iomaps;
 
-	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, i_blocksize(inode));
+	nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout,
+						    lcp->lc_up_len,
+						    &iomaps,
+						    i_blocksize(inode));
 	if (nr_iomaps < 0)
 		return nfserrno(nr_iomaps);
 
@@ -317,8 +319,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
 	struct iomap *iomaps;
 	int nr_iomaps;
 
-	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, i_blocksize(inode));
+	nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout,
+						   lcp->lc_up_len,
+						   &iomaps,
+						   i_blocksize(inode));
 	if (nr_iomaps < 0)
 		return nfserrno(nr_iomaps);
 
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 442543304930..e3e3d79c8b4f 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
 }
 
 int
-nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
-		u32 block_size)
+nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
+				struct iomap **iomapp, u32 block_size)
 {
+	struct xdr_stream xdr;
 	struct iomap *iomaps;
 	u32 nr_iomaps, i;
+	char scratch[sizeof(struct pnfs_block_extent)];
 
 	if (len < sizeof(u32)) {
 		dprintk("%s: extent array too small: %u\n", __func__, len);
@@ -119,7 +121,15 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		return -EINVAL;
 	}
 
-	nr_iomaps = be32_to_cpup(p++);
+	xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+	xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+	if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+		dprintk("%s: failed to decode extent array length\n",
+			__func__);
+		return -EINVAL;
+	}
+
 	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
 			__func__, len, nr_iomaps);
@@ -135,28 +145,51 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 	for (i = 0; i < nr_iomaps; i++) {
 		struct pnfs_block_extent bex;
 
-		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
-		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+		if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) <
+		    sizeof(bex.vol_id)) {
+			dprintk("%s: failed to decode device id for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 
-		p = xdr_decode_hyper(p, &bex.foff);
+		if (xdr_stream_decode_u64(&xdr, &bex.foff)) {
+			dprintk("%s: failed to decode offset for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (bex.foff & (block_size - 1)) {
 			dprintk("%s: unaligned offset 0x%llx\n",
 				__func__, bex.foff);
 			goto fail;
 		}
-		p = xdr_decode_hyper(p, &bex.len);
+
+		if (xdr_stream_decode_u64(&xdr, &bex.len)) {
+			dprintk("%s: failed to decode length for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (bex.len & (block_size - 1)) {
 			dprintk("%s: unaligned length 0x%llx\n",
 				__func__, bex.foff);
 			goto fail;
 		}
-		p = xdr_decode_hyper(p, &bex.soff);
+
+		if (xdr_stream_decode_u64(&xdr, &bex.soff)) {
+			dprintk("%s: failed to decode soffset for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (bex.soff & (block_size - 1)) {
 			dprintk("%s: unaligned disk offset 0x%llx\n",
 				__func__, bex.soff);
 			goto fail;
 		}
-		bex.es = be32_to_cpup(p++);
+
+		if (xdr_stream_decode_u32(&xdr, &bex.es)) {
+			dprintk("%s: failed to decode estate for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
 			dprintk("%s: incorrect extent state %d\n",
 				__func__, bex.es);
@@ -175,18 +208,27 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 }
 
 int
-nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
-		u32 block_size)
+nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len,
+			       struct iomap **iomapp, u32 block_size)
 {
+	struct xdr_stream xdr;
 	struct iomap *iomaps;
 	u32 nr_iomaps, expected, i;
+	char scratch[sizeof(u64)];
 
 	if (len < sizeof(u32)) {
 		dprintk("%s: extent array too small: %u\n", __func__, len);
 		return -EINVAL;
 	}
 
-	nr_iomaps = be32_to_cpup(p++);
+	xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+	xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+	if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+		dprintk("%s: failed to decode extent array length\n", __func__);
+		return -EINVAL;
+	}
+
 	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
 	if (len != expected) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
@@ -203,14 +245,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 	for (i = 0; i < nr_iomaps; i++) {
 		u64 val;
 
-		p = xdr_decode_hyper(p, &val);
+		if (xdr_stream_decode_u64(&xdr, &val)) {
+			dprintk("%s: failed to decode offset for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (val & (block_size - 1)) {
 			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
 			goto fail;
 		}
 		iomaps[i].offset = val;
 
-		p = xdr_decode_hyper(p, &val);
+		if (xdr_stream_decode_u64(&xdr, &val)) {
+			dprintk("%s: failed to decode length for entry %u\n",
+				__func__, i);
+			goto fail;
+		}
 		if (val & (block_size - 1)) {
 			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
 			goto fail;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index bc5166bfe46b..c4c8139b8e96 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
 		struct nfsd4_getdeviceinfo *gdp);
 __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		struct nfsd4_layoutget *lgp);
-int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
-		u32 block_size);
-int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
-		u32 block_size);
+int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
+		struct iomap **iomapp, u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len,
+		struct iomap **iomapp, u32 block_size);
 
 #endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5a93a5db4fb0..81f42dc75b95 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -592,11 +592,8 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
 
 	if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
 		return nfserr_bad_xdr;
-	if (lcp->lc_up_len > 0) {
-		lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
-		if (!lcp->lc_up_layout)
-			return nfserr_bad_xdr;
-	}
+	if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len))
+		return nfserr_bad_xdr;
 
 	return nfs_ok;
 }
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 846ab6df9d48..8516a1a6b46d 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -492,7 +492,7 @@ struct nfsd4_layoutcommit {
 	struct timespec64	lc_mtime;	/* request */
 	u32			lc_layout_type;	/* request */
 	u32			lc_up_len;	/* layout length */
-	void			*lc_up_layout;	/* decoded by callback */
+	struct xdr_buf		lc_up_layout;	/* request, decoded by callback */
 	u32			lc_size_chg;	/* boolean for response */
 	u64			lc_newsize;	/* response */
 };
-- 
2.43.0



* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-04 13:07 [PATCH] nfsd: Implement large extent array support in pNFS Sergey Bashirov
@ 2025-06-04 14:10 ` Chuck Lever
  2025-06-04 14:54 ` Christoph Hellwig
  1 sibling, 0 replies; 11+ messages in thread
From: Chuck Lever @ 2025-06-04 14:10 UTC (permalink / raw)
  To: Sergey Bashirov, J . Bruce Fields; +Cc: linux-nfs, Konstantin Evtushenko

On 6/4/25 9:07 AM, Sergey Bashirov wrote:
> When a pNFS client in block layout mode sends a layoutcommit RPC to the
> MDS, a variable-length array of modified extents is supplied within the
> request. This patch allows the NFS server to accept such extent arrays
> even when they do not fit within a single memory page.
> 
> Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com>
> Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com>
> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
> ---
>  fs/nfsd/blocklayout.c    | 12 ++++---
>  fs/nfsd/blocklayoutxdr.c | 78 ++++++++++++++++++++++++++++++++--------
>  fs/nfsd/blocklayoutxdr.h |  8 ++---
>  fs/nfsd/nfs4xdr.c        |  7 ++--
>  fs/nfsd/xdr4.h           |  2 +-
>  5 files changed, 79 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index e5c0982a381d..d40a0860fcf6 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -179,8 +179,10 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
>  	struct iomap *iomaps;
>  	int nr_iomaps;
>  
> -	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> -			lcp->lc_up_len, &iomaps, i_blocksize(inode));
> +	nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout,
> +						    lcp->lc_up_len,
> +						    &iomaps,
> +						    i_blocksize(inode));
>  	if (nr_iomaps < 0)
>  		return nfserrno(nr_iomaps);
>  
> @@ -317,8 +319,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
>  	struct iomap *iomaps;
>  	int nr_iomaps;
>  
> -	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
> -			lcp->lc_up_len, &iomaps, i_blocksize(inode));
> +	nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout,
> +						   lcp->lc_up_len,
> +						   &iomaps,
> +						   i_blocksize(inode));
>  	if (nr_iomaps < 0)
>  		return nfserrno(nr_iomaps);
>  
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index 442543304930..e3e3d79c8b4f 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
>  }
>  
>  int
> -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> -		u32 block_size)
> +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> +				struct iomap **iomapp, u32 block_size)
>  {
> +	struct xdr_stream xdr;
>  	struct iomap *iomaps;
>  	u32 nr_iomaps, i;
> +	char scratch[sizeof(struct pnfs_block_extent)];
>  
>  	if (len < sizeof(u32)) {
>  		dprintk("%s: extent array too small: %u\n", __func__, len);
> @@ -119,7 +121,15 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  		return -EINVAL;
>  	}
>  
> -	nr_iomaps = be32_to_cpup(p++);
> +	xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
> +	xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
> +
> +	if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
> +		dprintk("%s: failed to decode extent array length\n",
> +			__func__);
> +		return -EINVAL;
> +	}
> +
>  	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
>  		dprintk("%s: extent array size mismatch: %u/%u\n",
>  			__func__, len, nr_iomaps);
> @@ -135,28 +145,51 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  	for (i = 0; i < nr_iomaps; i++) {
>  		struct pnfs_block_extent bex;
>  
> -		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
> -		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
> +		if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) <
> +		    sizeof(bex.vol_id)) {
> +			dprintk("%s: failed to decode device id for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  
> -		p = xdr_decode_hyper(p, &bex.foff);
> +		if (xdr_stream_decode_u64(&xdr, &bex.foff)) {
> +			dprintk("%s: failed to decode offset for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (bex.foff & (block_size - 1)) {
>  			dprintk("%s: unaligned offset 0x%llx\n",
>  				__func__, bex.foff);
>  			goto fail;
>  		}
> -		p = xdr_decode_hyper(p, &bex.len);
> +
> +		if (xdr_stream_decode_u64(&xdr, &bex.len)) {
> +			dprintk("%s: failed to decode length for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (bex.len & (block_size - 1)) {
>  			dprintk("%s: unaligned length 0x%llx\n",
>  				__func__, bex.foff);
>  			goto fail;
>  		}
> -		p = xdr_decode_hyper(p, &bex.soff);
> +
> +		if (xdr_stream_decode_u64(&xdr, &bex.soff)) {
> +			dprintk("%s: failed to decode soffset for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (bex.soff & (block_size - 1)) {
>  			dprintk("%s: unaligned disk offset 0x%llx\n",
>  				__func__, bex.soff);
>  			goto fail;
>  		}
> -		bex.es = be32_to_cpup(p++);
> +
> +		if (xdr_stream_decode_u32(&xdr, &bex.es)) {
> +			dprintk("%s: failed to decode estate for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
>  			dprintk("%s: incorrect extent state %d\n",
>  				__func__, bex.es);
> @@ -175,18 +208,27 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  }
>  
>  int
> -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> -		u32 block_size)
> +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> +			       struct iomap **iomapp, u32 block_size)
>  {
> +	struct xdr_stream xdr;
>  	struct iomap *iomaps;
>  	u32 nr_iomaps, expected, i;
> +	char scratch[sizeof(u64)];
>  
>  	if (len < sizeof(u32)) {
>  		dprintk("%s: extent array too small: %u\n", __func__, len);
>  		return -EINVAL;
>  	}
>  
> -	nr_iomaps = be32_to_cpup(p++);
> +	xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
> +	xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
> +
> +	if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
> +		dprintk("%s: failed to decode extent array length\n", __func__);
> +		return -EINVAL;
> +	}
> +
>  	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
>  	if (len != expected) {
>  		dprintk("%s: extent array size mismatch: %u/%u\n",
> @@ -203,14 +245,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  	for (i = 0; i < nr_iomaps; i++) {
>  		u64 val;
>  
> -		p = xdr_decode_hyper(p, &val);
> +		if (xdr_stream_decode_u64(&xdr, &val)) {
> +			dprintk("%s: failed to decode offset for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (val & (block_size - 1)) {
>  			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
>  			goto fail;
>  		}
>  		iomaps[i].offset = val;
>  
> -		p = xdr_decode_hyper(p, &val);
> +		if (xdr_stream_decode_u64(&xdr, &val)) {
> +			dprintk("%s: failed to decode length for entry %u\n",
> +				__func__, i);
> +			goto fail;
> +		}
>  		if (val & (block_size - 1)) {
>  			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
>  			goto fail;
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> index bc5166bfe46b..c4c8139b8e96 100644
> --- a/fs/nfsd/blocklayoutxdr.h
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
>  		struct nfsd4_getdeviceinfo *gdp);
>  __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>  		struct nfsd4_layoutget *lgp);
> -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> -		u32 block_size);
> -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> -		u32 block_size);
> +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> +		struct iomap **iomapp, u32 block_size);
> +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> +		struct iomap **iomapp, u32 block_size);
>  
>  #endif /* _NFSD_BLOCKLAYOUTXDR_H */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 5a93a5db4fb0..81f42dc75b95 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -592,11 +592,8 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
>  
>  	if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
>  		return nfserr_bad_xdr;
> -	if (lcp->lc_up_len > 0) {
> -		lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
> -		if (!lcp->lc_up_layout)
> -			return nfserr_bad_xdr;
> -	}
> +	if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len))
> +		return nfserr_bad_xdr;
>  
>  	return nfs_ok;
>  }
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index 846ab6df9d48..8516a1a6b46d 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -492,7 +492,7 @@ struct nfsd4_layoutcommit {
>  	struct timespec64	lc_mtime;	/* request */
>  	u32			lc_layout_type;	/* request */
>  	u32			lc_up_len;	/* layout length */
> -	void			*lc_up_layout;	/* decoded by callback */
> +	struct xdr_buf		lc_up_layout;	/* request, decoded by callback */
>  	u32			lc_size_chg;	/* boolean for response */
>  	u64			lc_newsize;	/* response */
>  };

Thanks for the suggestion, Sergey!

Note the MAINTAINERS entry for NFSD:

$ scripts/get_maintainer.pl fs/nfsd/vfs.c
Chuck Lever <chuck.lever@oracle.com> (maintainer:KERNEL NFSD, SUNRPC,
AND LOCKD SERVERS)
Jeff Layton <jlayton@kernel.org> (maintainer:KERNEL NFSD, SUNRPC, AND
LOCKD SERVERS)
Neil Brown <neilb@suse.de> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Olga Kornievskaia <okorniev@redhat.com> (reviewer:KERNEL NFSD, SUNRPC,
AND LOCKD SERVERS)
Dai Ngo <Dai.Ngo@oracle.com> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD
SERVERS)
Tom Talpey <tom@talpey.com> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD
SERVERS)
linux-nfs@vger.kernel.org (open list:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
linux-kernel@vger.kernel.org (open list)
KERNEL NFSD, SUNRPC, AND LOCKD SERVERS status: Supported

In particular, Dai is looking at the Linux NFS server's pNFS with iSCSI
implementation right at the moment and might have some thoughts about
expanding the number of extents in block layouts.

Can you repost your patch with the current reviewers and maintainers
copied as appropriate?


-- 
Chuck Lever


* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-04 13:07 [PATCH] nfsd: Implement large extent array support in pNFS Sergey Bashirov
  2025-06-04 14:10 ` Chuck Lever
@ 2025-06-04 14:54 ` Christoph Hellwig
  2025-06-10  0:36   ` Sergey Bashirov
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2025-06-04 14:54 UTC (permalink / raw)
  To: Sergey Bashirov
  Cc: J . Bruce Fields, Chuck Lever, linux-nfs, Konstantin Evtushenko

On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote:
> When a pNFS client in block layout mode sends a layoutcommit RPC to the
> MDS, a variable-length array of modified extents is supplied within the
> request. This patch allows the NFS server to accept such extent arrays
> even when they do not fit within a single memory page.

How did you manage to trigger such a workload?

Also your patch doesn't apply to current mainline.

>  	struct iomap *iomaps;
>  	int nr_iomaps;
>  
> -	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> -			lcp->lc_up_len, &iomaps, i_blocksize(inode));
> +	nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout,
> +						    lcp->lc_up_len,
> +						    &iomaps,
> +						    i_blocksize(inode));

Please drop all the spurious reformatting to a harder to read style.

> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
>  }
>  
>  int
> -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> -		u32 block_size)
> +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> +				struct iomap **iomapp, u32 block_size)
>  {

So if you pass the xdr_buf that already has the length, can't we drop
the len argument?

> +	struct xdr_stream xdr;

Also I'm not the expert on the xdr_stream API, but instead of setting
up a sub-buffer here shouldn't we reuse the one from the actual XDR
decoding stage? And if that can't work (Chuck is probably a better
source of know-how on that than me), at least initialize it in the
main nfsd layoutcommit handler rather than in the layout drivers.
We'd probably want that initialized in the core nfsd code and not
in the layout driver.

> +		if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) <

Overly long line.

> +	if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len))

Same.



* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-04 14:54 ` Christoph Hellwig
@ 2025-06-10  0:36   ` Sergey Bashirov
  2025-06-10  5:39     ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Sergey Bashirov @ 2025-06-10  0:36 UTC (permalink / raw)
  To: Christoph Hellwig, Chuck Lever
  Cc: J . Bruce Fields, Konstantin Evtushenko, linux-nfs

On Wed, Jun 04, 2025 at 10:10:15AM -0400, Chuck Lever wrote:
> Can you repost your patch with the current reviewers and maintainers
> copied as appropriate?

Sorry for the newbie mistake, this is the first patch I'm sending
upstream. Yes, I will send patch v2 with the right reviewers and
maintainers.

On Wed, Jun 04, 2025 at 07:54:31AM -0700, Christoph Hellwig wrote:
> On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote:
> > When a pNFS client in block layout mode sends a layoutcommit RPC to the
> > MDS, a variable-length array of modified extents is supplied within the
> > request. This patch allows the NFS server to accept such extent arrays
> > even when they do not fit within a single memory page.
>
> How did you manage to trigger such a workload?

Together with Konstantin we spent a lot of time enabling the pNFS block
volume setup. We have SDS that can attach virtual block devices via
vhost-user-blk to virtual machines. And we researched the way to create
parallel or distributed file system on top of this SDS. From this point
of view, pNFS block volume layout architecture looks quite suitable. So,
we created several VMs, configured pNFS and started testing. In fact,
during our extensive testing, we encountered a variety of issues including
deadlocks, livelocks, and corrupted files, which we eventually fixed.
Now we have a working setup and we would like to clean up the code and
contribute it.

> Also your patch doesn't apply to current mainline.

We will use the git repository link for NFSD from the MAINTAINERS file and
"nfsd-next" branch to work on top of it. Please let me know if this is the
wrong repository or branch in our case.

> > +++ b/fs/nfsd/blocklayoutxdr.c
> > @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
> >  }
> >
> >  int
> > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> > -		u32 block_size)
> > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len,
> > +				struct iomap **iomapp, u32 block_size)
> >  {
>
> So if you pass the xdr_buf that already has the length, can't we drop
> the len argument?

Thanks for the suggestions! I will rework it in patch v2 to get rid of
the len argument and the lc_up_len field.
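
Just to be sure I understood the idea correctly, the v2 decoder would
then have roughly the following shape (only an untested sketch; buf->len
can replace the explicit length argument because xdr_stream_subsegment()
sets it when lc_up_layout is carved out of the compound):

int
nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp,
		u32 block_size)
{
	struct xdr_stream xdr;
	u32 nr_iomaps;
	char scratch[sizeof(struct pnfs_block_extent)];

	/* buf->len is the length of the opaque layout update body */
	if (buf->len < sizeof(u32)) {
		dprintk("%s: extent array too small: %u\n", __func__, buf->len);
		return -EINVAL;
	}

	xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
	xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));

	if (xdr_stream_decode_u32(&xdr, &nr_iomaps))
		return -EINVAL;
	if (nr_iomaps != buf->len / PNFS_BLOCK_EXTENT_SIZE)
		return -EINVAL;

	/* ... per-extent decode loop and iomap allocation as in this patch ... */
}

The callers in nfsd4_block_proc_layoutcommit() and nfsd4_scsi_proc_layoutcommit()
would then pass only &lcp->lc_up_layout, and lc_up_len goes away.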

> > +	struct xdr_stream xdr;
>
> Also I'm not the expert on the xdr_stream API, but instead of setting
> up a sub-buffer here shouldn't we reuse the one from the actual XDR
> decoding stage? And if that can't work (Chuck is probably a better
> source of know-how on that than me), at least initialize it in the
> main nfsd layoutcommit handler rather than in the layout drivers.
> We'd probably want that initialized in the core nfsd code and not
> in the layout driver.

As for the sub-buffer, the xdr_buf structure is initialized in the core
nfsd code to point only to the "opaque" field of the "layoutupdate4"
structure. Since this field is specific to each layout driver, its
xdr_stream is created on demand inside the field handler. For example,
the "opaque" field is not used in the file layouts. Do we really need to
expose the xdr_stream outside the field handler? Probably not. I also
checked how this is implemented in the nfs client code and found that
xdr_stream is created in a similar way inside the layout driver. Below
I have outlined some thoughts on why I implemented it this way. Please
correct me if I missed anything.

Let's say we have a layout commit with 1000 extents (in practice we
observed much more). The size of each block extent structure is 44 bytes,
and we have another 4 bytes for the total number of extents. So, the
"opaque" field of the "layoutupdate4" structure has a size of 44004 bytes.
In this case, we cannot simply borrow a pointer to the xdr internal page
or the scratch buffer using xdr_inline_decode(). What options do we have?

1. Allocate a large enough memory buffer and copy the "opaque" field into
    it. But I think an extra copy of a large field is not what we prefer.

2. When RPC is received, nfsd_dispatch() first decodes the entire compound
    request and only then processes each operation. Yes, we can create a new
    callback in the layout driver interface to decode the "opaque" field
    during the decoding phase and use the actual xdr stream of the request.
    What I don't like here is that the layout driver is forced to parse a
    large data buffer before general checks are done (sequence ID, file
   handle, state ID, range, grace period, etc.). This opens up
    opportunities to abuse the server by sending invalid layout commits with
    the maximum possible number of extents (RPC can be up to 1MB).

3. It is also possible to store only the position of the "opaque" field in
    the actual xdr stream of the request during the decoding phase and defer
    large data buffer parsing until the processing phase. This is what the
    original code does, and this patch takes it in the same direction.

--
Sergey Bashirov


* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-10  0:36   ` Sergey Bashirov
@ 2025-06-10  5:39     ` Christoph Hellwig
  2025-06-10 15:24       ` Sergey Bashirov
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2025-06-10  5:39 UTC (permalink / raw)
  To: Sergey Bashirov
  Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields,
	Konstantin Evtushenko, linux-nfs

On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote:
> On Wed, Jun 04, 2025 at 07:54:31AM -0700, Christoph Hellwig wrote:
> > On Wed, Jun 04, 2025 at 04:07:08PM +0300, Sergey Bashirov wrote:
> > > When a pNFS client in block layout mode sends a layoutcommit RPC to the
> > > MDS, a variable-length array of modified extents is supplied within the
> > > request. This patch allows the NFS server to accept such extent arrays
> > > even when they do not fit within a single memory page.
> > 
> > How did you manage to trigger such a workload?
> 
> Together with Konstantin we spent a lot of time enabling the pNFS block
> volume setup. We have SDS that can attach virtual block devices via
> vhost-user-blk to virtual machines. And we researched the way to create
> parallel or distributed file system on top of this SDS. From this point
> of view, pNFS block volume layout architecture looks quite suitable. So,
> we created several VMs, configured pNFS and started testing. In fact,
> during our extensive testing, we encountered a variety of issues including
> deadlocks, livelocks, and corrupted files, which we eventually fixed.
> Now we have a working setup and we would like to clean up the code and
> contribute it.

Can you share your reproducer scripts for client and server?

Btw, also as a little warning:  the current pNFS code means any client
can corrupt the XFS metadata.  If you want to actually use the code
in production you'll probably want to figure out a way to either use
the RT device for exposed data (should be easy, but the RT allocator
sucks..), or find a way to otherwise restrict clients from overwriting
metadata.

> > Also your patch doesn't apply to current mainline.
> 
> We will use the git repository link for NFSD from the MAINTAINERS file and
> "nfsd-next" branch to work on top of it. Please let me know if this is the
> wrong repository or branch in our case.

I guess that's generally fine, but around a -rc1 release it's a bit
cumbersome.

> As for the sub-buffer, the xdr_buf structure is initialized in the core
> nfsd code to point only to the "opaque" field of the "layoutupdate4"
> structure. Since this field is specific to each layout driver, its
> xdr_stream is created on demand inside the field handler. For example,
> the "opaque" field is not used in the file layouts. Do we really need to
> expose the xdr_stream outside the field handler? Probably not. I also
> checked how this is implemented in the nfs client code and found that
> xdr_stream is created in a similar way inside the layout driver. Below
> I have outlined some thoughts on why I implemented it this way. Please
> correct me if I missed anything.

Well, the fields are opaque, but everyone has to either decode (or
ignore it).  So having common setup sounds useful.

> 1. Allocate a large enough memory buffer and copy the "opaque" field into
>    it. But I think an extra copy of a large field is not what we prefer.

Agreed.

> 2. When RPC is received, nfsd_dispatch() first decodes the entire compound
>    request and only then processes each operation. Yes, we can create a new
>    callback in the layout driver interface to decode the "opaque" field
>    during the decoding phase and use the actual xdr stream of the request.
>    What I don't like here is that the layout driver is forced to parse a
>    large data buffer before general checks are done (sequence ID, file
>    handle, state ID, range, grace period, etc.). This opens up
>    opportunities to abuse the server by sending invalid layout commits with
>    the maximum possible number of extents (RPC can be up to 1MB).

OTOH the same happens for parsing any other NFS compound that isn't
split into layouts, isn't it?  And we have total size limits on the
transfer.



* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-10  5:39     ` Christoph Hellwig
@ 2025-06-10 15:24       ` Sergey Bashirov
  2025-06-11  6:55         ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Sergey Bashirov @ 2025-06-10 15:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs

On Mon, Jun 09, 2025 at 10:39:06PM -0700, Christoph Hellwig wrote:
> On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote:
> > Together with Konstantin we spent a lot of time enabling the pNFS block
> > volume setup. We have SDS that can attach virtual block devices via
> > vhost-user-blk to virtual machines. And we researched the way to create
> > parallel or distributed file system on top of this SDS. From this point
> > of view, pNFS block volume layout architecture looks quite suitable. So,
> > we created several VMs, configured pNFS and started testing. In fact,
> > during our extensive testing, we encountered a variety of issues including
> > deadlocks, livelocks, and corrupted files, which we eventually fixed.
> > Now we have a working setup and we would like to clean up the code and
> > contribute it.
>
> Can you share your reproducer scripts for client and server?

I will try. First of all, you need two VMs connected to the same network.
The hardest part is somehow to connect a shared block device to both VMs
with RW access. As I mentioned, we use a proprietary SDS for it. Note
that if you pass the device to the VM as virtio-blk, the client will
select the block layout driver, and if you pass the device to the VM
as virtio-scsi, the client will select the scsi layout driver.

Script I use to start VMs (both use the same parameters, only the names,
boot images and network addresses differ):
#!/bin/sh
BOOT_DISK="-drive format=qcow2,file=pnfs_server.img"
SSH_NET="-device virtio-net-pci,netdev=net0,mac=52:54:00:12:34:57
          -netdev user,id=net0,hostfwd=tcp::20001-:22"
NFS_NET="-device e1000,netdev=net1,mac=52:54:00:12:34:67
          -netdev socket,id=net1,listen=:1234"
MP="/dev/hugepages/libvirt/qemu"
VHOST_DISK="-object memory-backend-file,id=mem,size=8G,mem-path=$MP,share=on
             -numa node,memdev=mem
             -chardev socket,id=char1,path=/var/lib/storage/pnfs_server.sock
             -device vhost-user-blk-pci,id=blk1,chardev=char1,num-queues=16"
qemu-system-x86_64 -daemonize -display none -name pnfs_server \
                    -cpu host -enable-kvm -smp 8 -m 8G \
                    $BOOT_DISK $SSH_NET $VHOST_DISK $NFS_NET

The server's /etc/nfs.conf:
[nfsd]
grace-time=90
lease-time=120
vers2=n
vers3=y
vers4=n
vers4.0=n
vers4.1=n
vers4.2=y

The server's /etc/exports:
/mnt/export *(pnfs,rw,sync,insecure,no_root_squash,no_subtree_check)

Please note that the block volume layout does not support partition
tables, volume groups, RAID, etc. So you need to create XFS on the raw
block device. In my case shared virtio-blk disk is /dev/vda.
And the file system can be prepared by following these steps:
sudo mkfs -t xfs /dev/vda
sudo mkdir -p /mnt/export
sudo mount -t xfs /dev/vda /mnt/export

After these steps you can start or restart the server:
sudo systemctl restart nfs-kernel-server

On the client side, you need to have the same /dev/vda device available,
but not mounted. Additionally, you need the blkmapd service running.
Perform the following steps to mount the share:
sudo systemctl start nfs-blkmap
sudo mkdir -p /mnt/pnfs
sudo mount -t nfs4 -v \
     -o minorversion=2,sync,hard,noatime,rsize=1048576,wsize=1048576,timeo=600,retrans=2 \
     192.168.1.1:/mnt/export /mnt/pnfs

This should create 2.5k extents:
fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \
     --rw=randwrite --ioengine=libaio --direct=1 --bs=4k  \
     --iodepth=128 --fallocate=none

You can check it on the server side:
xfs_bmap -elpvv /mnt/export/test.raw

Troubleshooting. If any error occurs, then kernel falls back to NFSv3.
Use nfsstat or mountstats to view RPC counters. Normally the READ and
WRITE counters should be zero, and the LAYOUTGET, LAYOUTCOMMIT,
LAYOUTRETURN should increase as you work with files. If the network
connection and the shared volume are working fine, then first of all check
the status of blkmapd; most probably it is at fault. Note that
the client code also has problems with the block extent array. Currently
the client tries to pack all the block extents it needs to commit into
one RPC. And if there are too many of them, you will see
"RPC: fragment too large" error on the server side. That's why
we set rsize and wsize to 1M for now. Another problem is that when the
extent array does not fit into a single memory page, the client code
discards the first page of encoded extents while reallocating a larger
buffer to continue layout commit encoding. So even with this patch you
may still notice that some files are not written correctly. But at least
the server shouldn't send the badxdr error on a well-formed layout commit.

> Btw, also as a little warning:  the current pNFS code means any client
> can corrupt the XFS metadata.  If you want to actually use the code
> in production you'll probably want to figure out a way to either use
> the RT device for exposed data (should be easy, but the RT allocator
> sucks..), or find a way to otherwise restrict clients from overwriting
> metadata.

Thanks for the advice! Yes, we have had issues with XFS corruption
especially when multiple clients were writing to the same file in
parallel. Spent some time debugging layout recalls and client fencing
to figure out what happened.

> > As for the sub-buffer, the xdr_buf structure is initialized in the core
> > nfsd code to point only to the "opaque" field of the "layoutupdate4"
> > structure. Since this field is specific to each layout driver, its
> > xdr_stream is created on demand inside the field handler. For example,
> > the "opaque" field is not used in the file layouts. Do we really need to
> > expose the xdr_stream outside the field handler? Probably not. I also
> > checked how this is implemented in the nfs client code and found that
> > xdr_stream is created in a similar way inside the layout driver. Below
> > I have outlined some thoughts on why I implemented it this way. Please
> > correct me if I missed anything.
>
> Well, the fields are opaque, but everyone has to either decode (or
> ignore it).  So having common setup sounds useful.
>
> > 2. When RPC is received, nfsd_dispatch() first decodes the entire compound
> >    request and only then processes each operation. Yes, we can create a new
> >    callback in the layout driver interface to decode the "opaque" field
> >    during the decoding phase and use the actual xdr stream of the request.
> >    What I don't like here is that the layout driver is forced to parse a
> >    large data buffer before general checks are done (sequence ID, file
> >    handle, state ID, range, grace period, etc.). This opens up
> >    opportunities to abuse the server by sending invalid layout commits with
> >    the maximum possible number of extents (RPC can be up to 1MB).
>
> OTOH the same happens for parsing any other NFS compound that isn't
> split into layouts, isn't it?  And we have total size limits on the
> transfer.

I agree, one large request and 1000 small requests look the same on the
wire. So, setting up an xdr_stream at a higher level requires adding it
to the nfsd4_layoutcommit structure, either as a substructure, which would
significantly increase the overall size of the layoutcommit argument, or
as a pointer, which would require some memory allocation and deallocation
logic. Also, in the core nfsd code we don't know the sufficient scratch
buffer size for a particular layout driver; most likely we would allocate
a page for it. This all seems a bit overengineered compared to two local
variables on the stack. I will think about it a little more.

--
Sergey Bashirov


* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-10 15:24       ` Sergey Bashirov
@ 2025-06-11  6:55         ` Christoph Hellwig
  2025-06-11 12:19           ` Sergey Bashirov
  2025-06-11 13:53           ` Chuck Lever
  0 siblings, 2 replies; 11+ messages in thread
From: Christoph Hellwig @ 2025-06-11  6:55 UTC (permalink / raw)
  To: Sergey Bashirov
  Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields,
	Konstantin Evtushenko, linux-nfs

On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote:
> On Mon, Jun 09, 2025 at 10:39:06PM -0700, Christoph Hellwig wrote:
> > On Tue, Jun 10, 2025 at 03:36:49AM +0300, Sergey Bashirov wrote:
> > > Together with Konstantin we spent a lot of time enabling the pNFS block
> > > volume setup. We have SDS that can attach virtual block devices via
> > > vhost-user-blk to virtual machines. And we researched the way to create
> > > parallel or distributed file system on top of this SDS. From this point
> > > of view, pNFS block volume layout architecture looks quite suitable. So,
> > > we created several VMs, configured pNFS and started testing. In fact,
> > > during our extensive testing, we encountered a variety of issues including
> > > deadlocks, livelocks, and corrupted files, which we eventually fixed.
> > > Now we have a working setup and we would like to clean up the code and
> > > contribute it.
> > 
> > Can you share your reproducer scripts for client and server?
> 
> I will try. First of all, you need two VMs connected to the same network.
> The hardest part is somehow to connect a shared block device to both VMs
> with RW access.

I know the basic setup :)

> On the client side, you need to have the same /dev/vda device available,
> but not mounted. Additionally, you need the blkmapd service running.

blkmapd is only needed for the block layout, which should generally be
avoided as it can't be used reliably because working fencing is
almost impossible.

> This should create 2.5k extents:
> fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \
>     --rw=randwrite --ioengine=libaio --direct=1 --bs=4k  \
>     --iodepth=128 --fallocate=none

Thanks!  We should find a way to wire up the test coverage
somewhere, e.g. xfstests.

> Troubleshooting. If any error occurs, then kernel falls back to NFSv3.

That should really be NFSv4.

> the client code also has problems with the block extent array. Currently
> the client tries to pack all the block extents it needs to commit into
> one RPC. And if there are too many of them, you will see
> "RPC: fragment too large" error on the server side. That's why
> we set rsize and wsize to 1M for now.

We'll really need to fix the client to split when going over the maximum
compound size.

> Another problem is that when the
> extent array does not fit into a single memory page, the client code
> discards the first page of encoded extents while reallocating a larger
> buffer to continue layout commit encoding. So even with this patch you
> may still notice that some files are not written correctly. But at least
> the server shouldn't send the badxdr error on a well-formed layout commit.

Eww, we'll need to fix that as well.  Would be good to have a reproducer
for that case as well.

> > Btw, also as a little warning:  the current pNFS code means any client
> > can corrupt the XFS metadata.  If you want to actually use the code
> > in production you'll probably want to figure out a way to either use
> > the RT device for exposed data (should be easy, but the RT allocator
> > sucks..), or find a way to otherwise restrict clients from overwriting
> > metadata.
> 
> Thanks for the advice! Yes, we have had issues with XFS corruption
> especially when multiple clients were writing to the same file in
> parallel. Spent some time debugging layout recalls and client fencing
> to figure out what happened.

Normal operation should not cause that, what did you see there?

I mean a malicious client targeting metadata outside its layout.



* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-11  6:55         ` Christoph Hellwig
@ 2025-06-11 12:19           ` Sergey Bashirov
  2025-06-12  6:33             ` Christoph Hellwig
  2025-06-11 13:53           ` Chuck Lever
  1 sibling, 1 reply; 11+ messages in thread
From: Sergey Bashirov @ 2025-06-11 12:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs

On Tue, Jun 10, 2025 at 11:55:09PM -0700, Christoph Hellwig wrote:
> On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote:
> > the client code also has problems with the block extent array. Currently
> > the client tries to pack all the block extents it needs to commit into
> > one RPC. And if there are too many of them, you will see
> > "RPC: fragment too large" error on the server side. That's why
> > we set rsize and wsize to 1M for now.
>
> We'll really need to fix the client to split when going over the maximum
> compound size.

Yes, working on patches to send for review.

> > Another problem is that when the
> > extent array does not fit into a single memory page, the client code
> > discards the first page of encoded extents while reallocating a larger
> > buffer to continue layout commit encoding. So even with this patch you
> > may still notice that some files are not written correctly. But at least
> > the server shouldn't send the badxdr error on a well-formed layout commit.
>
> Eww, we'll need to fix that as well.  Would be good to have a reproducer
> for that case as well.

Will prepare and send patches too. The reproducer should be the same as
the one I sent for the server. Just check what FIO has written with the
additional option --verify=crc32c.
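
For reference, that should be just the reproducer from my earlier message
with verification enabled:

fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \
    --rw=randwrite --ioengine=libaio --direct=1 --bs=4k  \
    --iodepth=128 --fallocate=none --verify=crc32c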

> > Thanks for the advice! Yes, we have had issues with XFS corruption
> > especially when multiple clients were writing to the same file in
> > parallel. Spent some time debugging layout recalls and client fencing
> > to figure out what happened.
>
> Normal operation should not cause that, what did you see there?

I think, this is not an NFS implementation issue, but rather a question
of how to properly implement the client fencing. In a distributed
storage system, there is a delay between the time NFS server requests
a blocking of writes to a shared volume for a particular client and the
time that blocking takes effect. If we choose an optimistic approach and
assume that fencing is done by simply sending a request (without waiting
for actual processing by the underlying storage system), then we might
end up in the following situation.

Let's think of layoutget as a byte range locking mechanism in terms of
writing to a single file from multiple clients. First of all, a client
writing without O_DIRECT through the page cache will delay processing
of the layout recall for too long if its user-space application really
writes a lot. As a consequence we observe significant performance
degradation, and sometimes the server decides that the client is not
responding at all and tries to fence it to allow the second client to
get the lock. At this moment we still have the first client writing, and
the server releasing the xfs_lease associated with the layout of the
first client. And if we are really unlucky, XFS might reallocate extents,
so the first client will be writing outside the file. And if we are
really, really unlucky, XFS might put some metadata there as well.

--
Sergey Bashirov


* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-11  6:55         ` Christoph Hellwig
  2025-06-11 12:19           ` Sergey Bashirov
@ 2025-06-11 13:53           ` Chuck Lever
  1 sibling, 0 replies; 11+ messages in thread
From: Chuck Lever @ 2025-06-11 13:53 UTC (permalink / raw)
  To: Christoph Hellwig, Sergey Bashirov
  Cc: J . Bruce Fields, Konstantin Evtushenko, linux-nfs

On 6/11/25 2:55 AM, Christoph Hellwig wrote:
> On Tue, Jun 10, 2025 at 06:24:03PM +0300, Sergey Bashirov wrote:
>>
>> This should create 2.5k extents:
>> fio --name=test --filename=/mnt/pnfs/test.raw --size=10M \
>>     --rw=randwrite --ioengine=libaio --direct=1 --bs=4k  \
>>     --iodepth=128 --fallocate=none
> 
> Thanks!  We should find a way to wire up the test coverage
> somewhere, e.g. xfstests.

I implemented support for setting up pNFS with iSCSI layouts in kdevops
last summer. Unfortunately since then I've been distracted by trying to
find enough hardware resources to add it to the regular matrix of tests.

-- 
Chuck Lever


* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-11 12:19           ` Sergey Bashirov
@ 2025-06-12  6:33             ` Christoph Hellwig
  2025-06-12  8:13               ` Sergey Bashirov
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2025-06-12  6:33 UTC (permalink / raw)
  To: Sergey Bashirov
  Cc: Christoph Hellwig, Chuck Lever, J . Bruce Fields,
	Konstantin Evtushenko, linux-nfs

On Wed, Jun 11, 2025 at 03:19:29PM +0300, Sergey Bashirov wrote:
> > Normal operation should not cause that, what did you see there?
> 
> I think, this is not an NFS implementation issue, but rather a question
> of how to properly implement the client fencing. In a distributed
> storage system, there is a delay between the time NFS server requests
> a blocking of writes to a shared volume for a particular client and the
> time that blocking takes effect. If we choose an optimistic approach and
> assume that fencing is done by simply sending a request (without waiting
> for actual processing by the underlying storage system), then we might
> end up in the following situation.

I guess this is using block layout and your own fencing?  Because
with the SCSI layout we fence right from the kernel path before
force returning the layout.  The fact that block layout can't do
reliable fencing is the reason why I came up with the SCSI layout,
which can.




* Re: [PATCH] nfsd: Implement large extent array support in pNFS
  2025-06-12  6:33             ` Christoph Hellwig
@ 2025-06-12  8:13               ` Sergey Bashirov
  0 siblings, 0 replies; 11+ messages in thread
From: Sergey Bashirov @ 2025-06-12  8:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chuck Lever, J . Bruce Fields, Konstantin Evtushenko, linux-nfs

On Wed, Jun 11, 2025 at 11:33:27PM -0700, Christoph Hellwig wrote:
> On Wed, Jun 11, 2025 at 03:19:29PM +0300, Sergey Bashirov wrote:
> > > Normal operation should not cause that, what did you see there?
> >
> > I think, this is not an NFS implementation issue, but rather a question
> > of how to properly implement the client fencing. In a distributed
> > storage system, there is a delay between the time NFS server requests
> > a blocking of writes to a shared volume for a particular client and the
> > time that blocking takes effect. If we choose an optimistic approach and
> > assume that fencing is done by simply sending a request (without waiting
> > for actual processing by the underlying storage system), then we might
> > end up in the following situation.
>
> I guess this is using block layout and your own fencing?  Because
> with the SCSI layout we fence right from the kernel path before
> force returning the layout.  The fact that block layout can't do
> reliable fencing is the reason why I came up with the SCSI layout,
> which can.

Yes, you are right.

By the way, even with SCSI Persistent Reservations the fencing is not
entirely clean and simple. We tried a third-party enterprise storage
system to test the SCSI layout, but it seems that the SCSI PR implementation
there is imperfect. We occasionally observed PR_KEYs being erroneously
revoked by the storage system. But the NFS part of this setup worked fine.

--
Sergey Bashirov

