public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] pnfs/filelayout: handle data server op_status errors
@ 2025-12-09 13:56 Robert Milkowski
  2025-12-09 15:33 ` Trond Myklebust
  0 siblings, 1 reply; 3+ messages in thread
From: Robert Milkowski @ 2025-12-09 13:56 UTC (permalink / raw)
  To: trondmy; +Cc: anna, linux-nfs, linux-kernel, Robert Milkowski

Data servers can return NFS-level status in op_status, but the file layout
driver only looked at RPC transport errors. That means session errors,
layout-invalidating statuses, and retry hints from the DS can be ignored,
leading to missed session recovery, stale layouts, or failed retries.

Pass the DS op_status into the async error handler and handle the same set
of NFS status codes as flexfiles (see commit 38074de35b01,
"NFSv4/flexfiles: Fix handling of NFS level errors in I/O"). Wire the
read/write/commit callbacks to propagate op_status so the file layout driver
can invalidate layouts, trigger session recovery, or retry as appropriate.

Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
---
 fs/nfs/filelayout/filelayout.c | 54 ++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 5c4551117c58..2808baa19f83 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -121,6 +121,7 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 }
 
 static int filelayout_async_handle_error(struct rpc_task *task,
+					 u32 op_status,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
 					 struct pnfs_layout_segment *lseg)
@@ -130,6 +131,48 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
 
+	if (op_status) {
+		switch (op_status) {
+		case NFS4_OK:
+		case NFS4ERR_NXIO:
+			break;
+		case NFS4ERR_BADSESSION:
+		case NFS4ERR_BADSLOT:
+		case NFS4ERR_BAD_HIGH_SLOT:
+		case NFS4ERR_DEADSESSION:
+		case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case NFS4ERR_SEQ_FALSE_RETRY:
+		case NFS4ERR_SEQ_MISORDERED:
+			dprintk("%s op_status %u, Reset session. Exchangeid "
+				"flags 0x%x\n", __func__, op_status,
+				clp->cl_exchange_flags);
+			nfs4_schedule_session_recovery(clp->cl_session,
+						      op_status);
+			goto out_retry;
+		case NFS4ERR_DELAY:
+		case NFS4ERR_GRACE:
+		case NFS4ERR_RETRY_UNCACHED_REP:
+			rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+			goto out_retry;
+		case NFS4ERR_ACCESS:
+		case NFS4ERR_PNFS_NO_LAYOUT:
+		case NFS4ERR_STALE:
+		case NFS4ERR_BADHANDLE:
+		case NFS4ERR_ISDIR:
+		case NFS4ERR_FHEXPIRED:
+		case NFS4ERR_WRONG_TYPE:
+		case NFS4ERR_NOMATCHING_LAYOUT:
+		case NFSERR_PERM:
+			dprintk("%s Invalid layout op_status %u\n", __func__,
+				op_status);
+			pnfs_destroy_layout(NFS_I(inode));
+			rpc_wake_up(&tbl->slot_tbl_waitq);
+			goto reset;
+		default:
+			goto reset;
+		}
+	}
+
 	if (task->tk_status >= 0)
 		return 0;
 
@@ -196,6 +239,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 			task->tk_status);
 		return -NFS4ERR_RESET_TO_MDS;
 	}
+
+out_retry:
 	task->tk_status = 0;
 	return -EAGAIN;
 }
@@ -208,7 +253,8 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
-	err = filelayout_async_handle_error(task, hdr->args.context->state,
+	err = filelayout_async_handle_error(task, hdr->res.op_status,
+					    hdr->args.context->state,
 					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
@@ -318,7 +364,8 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
-	err = filelayout_async_handle_error(task, hdr->args.context->state,
+	err = filelayout_async_handle_error(task, hdr->res.op_status,
+					    hdr->args.context->state,
 					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
@@ -346,7 +393,8 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+	err = filelayout_async_handle_error(task, data->res.op_status,
+					    NULL, data->ds_clp,
 					    data->lseg);
 
 	switch (err) {

base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821
-- 
2.47.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] pnfs/filelayout: handle data server op_status errors
  2025-12-09 13:56 [PATCH] pnfs/filelayout: handle data server op_status errors Robert Milkowski
@ 2025-12-09 15:33 ` Trond Myklebust
  2026-03-23 13:03   ` Robert Milkowski
  0 siblings, 1 reply; 3+ messages in thread
From: Trond Myklebust @ 2025-12-09 15:33 UTC (permalink / raw)
  To: Robert Milkowski; +Cc: anna, linux-nfs, linux-kernel

On Tue, 2025-12-09 at 13:56 +0000, Robert Milkowski wrote:
> Data servers can return NFS-level status in op_status, but the file
> layout
> driver only looked at RPC transport errors. That means session
> errors,
> layout-invalidating statuses, and retry hints from the DS can be
> ignored,
> leading to missed session recovery, stale layouts, or failed retries.

The task->tk_status can carry NFS level status if there was no RPC
error. That's why we distinguish between task->tk_status and task-
>tk_rpc_status (the latter being guaranteed to only carry RPC errors).

IOW: is there any evidence of what you call out above?

> 
> Pass the DS op_status into the async error handler and handle the
> same set
> of NFS status codes as flexfiles (see commit 38074de35b01,
> "NFSv4/flexfiles: Fix handling of NFS level errors in I/O"). Wire the
> read/write/commit callbacks to propagate op_status so the file layout
> driver
> can invalidate layouts, trigger session recovery, or retry as
> appropriate.
> 
> Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
> ---
>  fs/nfs/filelayout/filelayout.c | 54
> ++++++++++++++++++++++++++++++++--
>  1 file changed, 51 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/filelayout/filelayout.c
> b/fs/nfs/filelayout/filelayout.c
> index 5c4551117c58..2808baa19f83 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -121,6 +121,7 @@ static void filelayout_reset_read(struct
> nfs_pgio_header *hdr)
>  }
>  
>  static int filelayout_async_handle_error(struct rpc_task *task,
> +					 u32 op_status,
>  					 struct nfs4_state *state,
>  					 struct nfs_client *clp,
>  					 struct pnfs_layout_segment
> *lseg)
> @@ -130,6 +131,48 @@ static int filelayout_async_handle_error(struct
> rpc_task *task,
>  	struct nfs4_deviceid_node *devid =
> FILELAYOUT_DEVID_NODE(lseg);
>  	struct nfs4_slot_table *tbl = &clp->cl_session-
> >fc_slot_table;
>  
> +	if (op_status) {
> +		switch (op_status) {
> +		case NFS4_OK:
> +		case NFS4ERR_NXIO:
> +			break;
> +		case NFS4ERR_BADSESSION:
> +		case NFS4ERR_BADSLOT:
> +		case NFS4ERR_BAD_HIGH_SLOT:
> +		case NFS4ERR_DEADSESSION:
> +		case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
> +		case NFS4ERR_SEQ_FALSE_RETRY:
> +		case NFS4ERR_SEQ_MISORDERED:
> +			dprintk("%s op_status %u, Reset session.
> Exchangeid "
> +				"flags 0x%x\n", __func__, op_status,
> +				clp->cl_exchange_flags);
> +			nfs4_schedule_session_recovery(clp-
> >cl_session,
> +						      op_status);
> +			goto out_retry;
> +		case NFS4ERR_DELAY:
> +		case NFS4ERR_GRACE:
> +		case NFS4ERR_RETRY_UNCACHED_REP:
> +			rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
> +			goto out_retry;
> +		case NFS4ERR_ACCESS:
> +		case NFS4ERR_PNFS_NO_LAYOUT:
> +		case NFS4ERR_STALE:
> +		case NFS4ERR_BADHANDLE:
> +		case NFS4ERR_ISDIR:
> +		case NFS4ERR_FHEXPIRED:
> +		case NFS4ERR_WRONG_TYPE:
> +		case NFS4ERR_NOMATCHING_LAYOUT:
> +		case NFSERR_PERM:
> +			dprintk("%s Invalid layout op_status %u\n",
> __func__,
> +				op_status);
> +			pnfs_destroy_layout(NFS_I(inode));
> +			rpc_wake_up(&tbl->slot_tbl_waitq);
> +			goto reset;
> +		default:
> +			goto reset;
> +		}
> +	}
> +
>  	if (task->tk_status >= 0)
>  		return 0;
>  
> @@ -196,6 +239,8 @@ static int filelayout_async_handle_error(struct
> rpc_task *task,
>  			task->tk_status);
>  		return -NFS4ERR_RESET_TO_MDS;
>  	}
> +
> +out_retry:
>  	task->tk_status = 0;
>  	return -EAGAIN;
>  }
> @@ -208,7 +253,8 @@ static int filelayout_read_done_cb(struct
> rpc_task *task,
>  	int err;
>  
>  	trace_nfs4_pnfs_read(hdr, task->tk_status);
> -	err = filelayout_async_handle_error(task, hdr->args.context-
> >state,
> +	err = filelayout_async_handle_error(task, hdr-
> >res.op_status,
> +					    hdr->args.context-
> >state,
>  					    hdr->ds_clp, hdr->lseg);
>  
>  	switch (err) {
> @@ -318,7 +364,8 @@ static int filelayout_write_done_cb(struct
> rpc_task *task,
>  	int err;
>  
>  	trace_nfs4_pnfs_write(hdr, task->tk_status);
> -	err = filelayout_async_handle_error(task, hdr->args.context-
> >state,
> +	err = filelayout_async_handle_error(task, hdr-
> >res.op_status,
> +					    hdr->args.context-
> >state,
>  					    hdr->ds_clp, hdr->lseg);
>  
>  	switch (err) {
> @@ -346,7 +393,8 @@ static int filelayout_commit_done_cb(struct
> rpc_task *task,
>  	int err;
>  
>  	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
> -	err = filelayout_async_handle_error(task, NULL, data-
> >ds_clp,
> +	err = filelayout_async_handle_error(task, data-
> >res.op_status,
> +					    NULL, data->ds_clp,
>  					    data->lseg);
>  
>  	switch (err) {
> 
> base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821

I'd be willing to take something like the above as a cleanup, assuming
that it replaces the existing handling of NFSv4.1 errors using task-
>tk_status instead of just adding new cases.

However I'd want to see evidence that the resulting patch has been
thoroughly tested before submission, because I currently have no way to
test it myself.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] pnfs/filelayout: handle data server op_status errors
  2025-12-09 15:33 ` Trond Myklebust
@ 2026-03-23 13:03   ` Robert Milkowski
  0 siblings, 0 replies; 3+ messages in thread
From: Robert Milkowski @ 2026-03-23 13:03 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: anna, linux-nfs, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2809 bytes --]

Hi Trond,

Apologies for the very long delay in responding.

You're right that v1 overstated the problem. filelayout already gets
NFS-level errors in `task->tk_status` when there is no RPC transport
failure. I've respun this as a cleanup of the existing filelayout NFSv4.1
error handling rather than as a parallel op_status-only path.

The new version:
- when `tk_rpc_status == 0`, keeps the existing negative `task->tk_status`
  handling and only uses `op_status` once `task->tk_status` is non-negative;
- keeps transport errors on `task->tk_rpc_status`;
- keeps malformed-reply/decode failures on the existing DS-unavailable path;
- preserves the existing DS connection error behaviour (reset to MDS rather
  than retrying the same DS RPC);
- handles `NFS4ERR_NOMATCHING_LAYOUT` through the same invalidate/reset path.

For evidence, the original investigation had two pieces:
- the blocked task was in the filelayout read path
  (`pnfs_update_layout+0x5e2/0xf50`, `fl_pnfs_update_layout.constprop.0`,
  `filelayout_pg_init_read`), and the client was using
  `nfs_layout_nfsv41_files` (`lsmod | grep -i layout` showed
  `nfs_layout_nfsv41_files`);
- in the packet capture, there are 96 DS GETATTR replies of
  `getattr ERROR: unk 60` between 2025-10-27 11:33:18.333378 and
  2025-10-27 11:34:38.718255, from 21 different DS IPs. In
  `include/linux/nfs4.h`, `NFS4ERR_NOMATCHING_LAYOUT` is 10060, so those
  replies are consistent with the DS returning
  `NFS4ERR_NOMATCHING_LAYOUT`, i.e. that the current layout no longer
  matches.

For example, the capture includes:
- `2025-10-27 11:33:18.333378 ... NFS reply xid 1334105934 reply ok 148 getattr ERROR: unk 60`
- `2025-10-27 11:33:18.333392 ... NFS reply xid 2507704000 reply ok 148 getattr ERROR: unk 60`
- `2025-10-27 11:34:38.718255 ... NFS reply xid 860829781 reply ok 148 getattr ERROR: unk 60`

The original incident notes also recorded the same symptom on multiple jobs
across multiple clients in the same time window, so this did not look like
an isolated single-host event.

That is what led me to respin this along the same NFS-status vs
RPC-status split as 38074de35b01, while keeping filelayout's existing
behaviour for local/decode and DS connection failures.

Testing:
- applies cleanly on current upstream `master`
  (`0e4f8f1a3d081e834be5fd0a62bdb2554fadd307`);
- build-tested there with `x86_64_defconfig`,
  `CONFIG_NFS_FS=m`, `CONFIG_NFS_V4=m`, `CONFIG_PNFS_FILE_LAYOUT=m`,
  followed by `make -j4`;
- operationally, we've been running a local version with the same
  `tk_status`/`tk_rpc_status`/`op_status` split and
  `NFS4ERR_NOMATCHING_LAYOUT` handling on multiple servers for several
  months, and haven't seen the original hang recur.

See the attached respun v2 patch.


[-- Attachment #2: 0001-pnfs-filelayout-handle-data-server-op_status-errors.patch --]
[-- Type: application/octet-stream, Size: 7795 bytes --]

From 1f9849a8291cee861d659eb4ea1b109d696dd3ed Mon Sep 17 00:00:00 2001
From: Robert Milkowski <rmilkowski@gmail.com>
Date: Mon, 23 Mar 2026 11:26:17 +0000
Subject: [PATCH] pnfs/filelayout: handle data server op_status errors

Data servers can return per-operation NFS status in op_status even when
the RPC transport succeeds. filelayout_async_handle_error() currently
mixes NFS-level and transport-level handling through task->tk_status.

Model this on the NFS-status vs RPC-status split in 38074de35b01
("NFSv4/flexfiles: Fix handling of NFS level errors in I/O"), while
keeping filelayout's existing behaviour for local/decode and DS
connection failures. Keep the existing negative task->tk_status handling
for successful transports, use op_status only once task->tk_status is
non-negative, and use tk_rpc_status strictly for transport errors.

Wire read/write/commit done callbacks to pass op_status into
filelayout_async_handle_error() and handle NFS4ERR_NOMATCHING_LAYOUT via
the existing layout invalidate / retry-through-MDS path.

Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
---
 fs/nfs/filelayout/filelayout.c | 141 ++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 45 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 90a11afa5d05..f941f6fa685e 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -121,6 +121,7 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 }
 
 static int filelayout_async_handle_error(struct rpc_task *task,
+					 u32 op_status,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
 					 struct pnfs_layout_segment *lseg)
@@ -129,50 +130,94 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	struct inode *inode = lo->plh_inode;
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+	int nfs_err = 0;
+
+	/* Only trust NFS status when the RPC reply was received. */
+	if (task->tk_rpc_status == 0) {
+		if (task->tk_status < 0)
+			nfs_err = task->tk_status;
+		else if (op_status)
+			nfs_err = -(int)op_status;
+	}
 
-	if (task->tk_status >= 0)
-		return 0;
-
-	switch (task->tk_status) {
-	/* DS session errors */
-	case -NFS4ERR_BADSESSION:
-	case -NFS4ERR_BADSLOT:
-	case -NFS4ERR_BAD_HIGH_SLOT:
-	case -NFS4ERR_DEADSESSION:
-	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-	case -NFS4ERR_SEQ_FALSE_RETRY:
-	case -NFS4ERR_SEQ_MISORDERED:
-		dprintk("%s ERROR %d, Reset session. Exchangeid "
-			"flags 0x%x\n", __func__, task->tk_status,
-			clp->cl_exchange_flags);
-		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+	switch (nfs_err) {
+	case -NFS4ERR_STALE:
+		nfs_err = -ESTALE;
 		break;
-	case -NFS4ERR_DELAY:
-	case -NFS4ERR_GRACE:
-		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+	case -NFS4ERR_BADHANDLE:
+		nfs_err = -EBADHANDLE;
 		break;
-	case -NFS4ERR_RETRY_UNCACHED_REP:
+	case -NFS4ERR_ISDIR:
+		nfs_err = -EISDIR;
 		break;
-	/* Invalidate Layout errors */
-	case -NFS4ERR_ACCESS:
-	case -NFS4ERR_PNFS_NO_LAYOUT:
-	case -ESTALE:           /* mapped NFS4ERR_STALE */
-	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
-	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
-	case -NFS4ERR_FHEXPIRED:
-	case -NFS4ERR_WRONG_TYPE:
-		dprintk("%s Invalid layout error %d\n", __func__,
-			task->tk_status);
-		/*
-		 * Destroy layout so new i/o will get a new layout.
-		 * Layout will not be destroyed until all current lseg
-		 * references are put. Mark layout as invalid to resend failed
-		 * i/o and all i/o waiting on the slot table to the MDS until
-		 * layout is destroyed and a new valid layout is obtained.
-		 */
-		pnfs_destroy_layout(NFS_I(inode));
-		rpc_wake_up(&tbl->slot_tbl_waitq);
-		goto reset;
+	}
+
+	/*
+	 * Preserve the old task->tk_status handling for non-NFS errors after a
+	 * successful transport. This avoids stale or partially decoded op_status
+	 * values overriding a newer local/decode failure.
+	 */
+	if (task->tk_rpc_status == 0 && task->tk_status < 0) {
+		switch (task->tk_status) {
+		case -ECONNREFUSED:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ENETUNREACH:
+		case -EIO:
+		case -ETIMEDOUT:
+		case -EPIPE:
+		case -EPROTO:
+		case -ENODEV:
+			dprintk("%s DS reply processing error %d\n", __func__,
+				task->tk_status);
+			goto ds_unavailable;
+		}
+	}
+
+	if (nfs_err < 0) {
+		switch (nfs_err) {
+		/* DS session errors */
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_DEADSESSION:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -NFS4ERR_SEQ_FALSE_RETRY:
+		case -NFS4ERR_SEQ_MISORDERED:
+			dprintk("%s ERROR %d, Reset session. Exchangeid "
+				"flags 0x%x\n", __func__, nfs_err,
+				clp->cl_exchange_flags);
+			nfs4_schedule_session_recovery(clp->cl_session, nfs_err);
+			goto out_retry;
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_GRACE:
+			rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+			goto out_retry;
+		case -NFS4ERR_RETRY_UNCACHED_REP:
+			goto out_retry;
+		/* Invalidate Layout errors */
+		case -NFS4ERR_ACCESS:
+		case -NFS4ERR_PNFS_NO_LAYOUT:
+		case -ESTALE:           /* mapped NFS4ERR_STALE */
+		case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+		case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+		case -NFS4ERR_FHEXPIRED:
+		case -NFS4ERR_WRONG_TYPE:
+		case -NFS4ERR_NOMATCHING_LAYOUT:
+			dprintk("%s Invalid layout error %d\n", __func__,
+				nfs_err);
+			pnfs_destroy_layout(NFS_I(inode));
+			rpc_wake_up(&tbl->slot_tbl_waitq);
+			goto reset;
+		default:
+			goto reset;
+		}
+	}
+
+	if (task->tk_rpc_status == 0)
+		return 0;
+
+	switch (task->tk_rpc_status) {
 	/* RPC connection errors */
 	case -ECONNREFUSED:
 	case -EHOSTDOWN:
@@ -184,7 +229,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -EPROTO:
 	case -ENODEV:
 		dprintk("%s DS connection error %d\n", __func__,
-			task->tk_status);
+			task->tk_rpc_status);
+ds_unavailable:
 		nfs4_mark_deviceid_unavailable(devid);
 		pnfs_error_mark_layout_for_return(inode, lseg);
 		pnfs_set_lo_fail(lseg);
@@ -193,9 +239,11 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	default:
 reset:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
-			task->tk_status);
+			task->tk_rpc_status ? task->tk_rpc_status : nfs_err);
 		return -NFS4ERR_RESET_TO_MDS;
 	}
+
+out_retry:
 	task->tk_status = 0;
 	return -EAGAIN;
 }
@@ -208,7 +256,8 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
-	err = filelayout_async_handle_error(task, hdr->args.context->state,
+	err = filelayout_async_handle_error(task, hdr->res.op_status,
+					    hdr->args.context->state,
 					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
@@ -318,7 +367,8 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
-	err = filelayout_async_handle_error(task, hdr->args.context->state,
+	err = filelayout_async_handle_error(task, hdr->res.op_status,
+					    hdr->args.context->state,
 					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
@@ -346,7 +396,8 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+	err = filelayout_async_handle_error(task, data->res.op_status,
+					    NULL, data->ds_clp,
 					    data->lseg);
 
 	switch (err) {
-- 
2.47.1


[-- Attachment #3: Type: text/plain, Size: 5300 bytes --]




Best regards,
 Robert Milkowski



> On 9 Dec 2025, at 15:33, Trond Myklebust <trondmy@kernel.org> wrote:
> 
> On Tue, 2025-12-09 at 13:56 +0000, Robert Milkowski wrote:
>> Data servers can return NFS-level status in op_status, but the file
>> layout
>> driver only looked at RPC transport errors. That means session
>> errors,
>> layout-invalidating statuses, and retry hints from the DS can be
>> ignored,
>> leading to missed session recovery, stale layouts, or failed retries.
> 
> The task->tk_status can carry NFS level status if there was no RPC
> error. That's why we distinguish between task->tk_status and task-
>> tk_rpc_status (the latter being guaranteed to only carry RPC errors).
> 
> IOW: is there any evidence of what you call out above?
> 
>> 
>> Pass the DS op_status into the async error handler and handle the
>> same set
>> of NFS status codes as flexfiles (see commit 38074de35b01,
>> "NFSv4/flexfiles: Fix handling of NFS level errors in I/O"). Wire the
>> read/write/commit callbacks to propagate op_status so the file layout
>> driver
>> can invalidate layouts, trigger session recovery, or retry as
>> appropriate.
>> 
>> Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
>> ---
>>  fs/nfs/filelayout/filelayout.c | 54
>> ++++++++++++++++++++++++++++++++--
>>  1 file changed, 51 insertions(+), 3 deletions(-)
>> 
>> diff --git a/fs/nfs/filelayout/filelayout.c
>> b/fs/nfs/filelayout/filelayout.c
>> index 5c4551117c58..2808baa19f83 100644
>> --- a/fs/nfs/filelayout/filelayout.c
>> +++ b/fs/nfs/filelayout/filelayout.c
>> @@ -121,6 +121,7 @@ static void filelayout_reset_read(struct
>> nfs_pgio_header *hdr)
>>  }
>>  
>>  static int filelayout_async_handle_error(struct rpc_task *task,
>> + u32 op_status,
>>   struct nfs4_state *state,
>>   struct nfs_client *clp,
>>   struct pnfs_layout_segment
>> *lseg)
>> @@ -130,6 +131,48 @@ static int filelayout_async_handle_error(struct
>> rpc_task *task,
>>   struct nfs4_deviceid_node *devid =
>> FILELAYOUT_DEVID_NODE(lseg);
>>   struct nfs4_slot_table *tbl = &clp->cl_session-
>>> fc_slot_table;
>>  
>> + if (op_status) {
>> + switch (op_status) {
>> + case NFS4_OK:
>> + case NFS4ERR_NXIO:
>> + break;
>> + case NFS4ERR_BADSESSION:
>> + case NFS4ERR_BADSLOT:
>> + case NFS4ERR_BAD_HIGH_SLOT:
>> + case NFS4ERR_DEADSESSION:
>> + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
>> + case NFS4ERR_SEQ_FALSE_RETRY:
>> + case NFS4ERR_SEQ_MISORDERED:
>> + dprintk("%s op_status %u, Reset session.
>> Exchangeid "
>> + "flags 0x%x\n", __func__, op_status,
>> + clp->cl_exchange_flags);
>> + nfs4_schedule_session_recovery(clp-
>>> cl_session,
>> +       op_status);
>> + goto out_retry;
>> + case NFS4ERR_DELAY:
>> + case NFS4ERR_GRACE:
>> + case NFS4ERR_RETRY_UNCACHED_REP:
>> + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
>> + goto out_retry;
>> + case NFS4ERR_ACCESS:
>> + case NFS4ERR_PNFS_NO_LAYOUT:
>> + case NFS4ERR_STALE:
>> + case NFS4ERR_BADHANDLE:
>> + case NFS4ERR_ISDIR:
>> + case NFS4ERR_FHEXPIRED:
>> + case NFS4ERR_WRONG_TYPE:
>> + case NFS4ERR_NOMATCHING_LAYOUT:
>> + case NFSERR_PERM:
>> + dprintk("%s Invalid layout op_status %u\n",
>> __func__,
>> + op_status);
>> + pnfs_destroy_layout(NFS_I(inode));
>> + rpc_wake_up(&tbl->slot_tbl_waitq);
>> + goto reset;
>> + default:
>> + goto reset;
>> + }
>> + }
>> +
>>   if (task->tk_status >= 0)
>>   return 0;
>>  
>> @@ -196,6 +239,8 @@ static int filelayout_async_handle_error(struct
>> rpc_task *task,
>>   task->tk_status);
>>   return -NFS4ERR_RESET_TO_MDS;
>>   }
>> +
>> +out_retry:
>>   task->tk_status = 0;
>>   return -EAGAIN;
>>  }
>> @@ -208,7 +253,8 @@ static int filelayout_read_done_cb(struct
>> rpc_task *task,
>>   int err;
>>  
>>   trace_nfs4_pnfs_read(hdr, task->tk_status);
>> - err = filelayout_async_handle_error(task, hdr->args.context-
>>> state,
>> + err = filelayout_async_handle_error(task, hdr-
>>> res.op_status,
>> +     hdr->args.context-
>>> state,
>>       hdr->ds_clp, hdr->lseg);
>>  
>>   switch (err) {
>> @@ -318,7 +364,8 @@ static int filelayout_write_done_cb(struct
>> rpc_task *task,
>>   int err;
>>  
>>   trace_nfs4_pnfs_write(hdr, task->tk_status);
>> - err = filelayout_async_handle_error(task, hdr->args.context-
>>> state,
>> + err = filelayout_async_handle_error(task, hdr-
>>> res.op_status,
>> +     hdr->args.context-
>>> state,
>>       hdr->ds_clp, hdr->lseg);
>>  
>>   switch (err) {
>> @@ -346,7 +393,8 @@ static int filelayout_commit_done_cb(struct
>> rpc_task *task,
>>   int err;
>>  
>>   trace_nfs4_pnfs_commit_ds(data, task->tk_status);
>> - err = filelayout_async_handle_error(task, NULL, data-
>>> ds_clp,
>> + err = filelayout_async_handle_error(task, data-
>>> res.op_status,
>> +     NULL, data->ds_clp,
>>       data->lseg);
>>  
>>   switch (err) {
>> 
>> base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821
> 
> I'd be willing to take something like the above as a cleanup, assuming
> that it replaces the existing handling of NFSv4.1 errors using task-
>> tk_status instead of just adding new cases.
> 
> However I'd want to see evidence that the resulting patch has been
> thoroughly tested before submission, because I currently have no way to
> test it myself.
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trondmy@kernel.org, trond.myklebust@hammerspace.com


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-03-23 13:04 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-12-09 13:56 [PATCH] pnfs/filelayout: handle data server op_status errors Robert Milkowski
2025-12-09 15:33 ` Trond Myklebust
2026-03-23 13:03   ` Robert Milkowski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox